From 9e87a4ce910b1ce2251daedcc910b2697fbc135a Mon Sep 17 00:00:00 2001 From: Nilstrieb <48135649+Nilstrieb@users.noreply.github.com> Date: Mon, 21 Mar 2022 14:59:18 +0100 Subject: [PATCH] lexer! --- Cargo.lock | 170 ++++++++++++++++++ ub_parser/Cargo.toml | 3 + ub_parser/src/ast.rs | 1 + ub_parser/src/lexer.rs | 82 ++++++++- .../ub_parser__lexer__tests__idents.snap | 15 ++ .../ub_parser__lexer__tests__keywords.snap | 14 ++ .../ub_parser__lexer__tests__literals.snap | 18 ++ .../ub_parser__lexer__tests__punctuation.snap | 31 ++++ .../ub_parser__lexer__tests__whitespace.snap | 8 + ub_parser/src/span.rs | 17 ++ 10 files changed, 355 insertions(+), 4 deletions(-) create mode 100644 ub_parser/src/snapshots/ub_parser__lexer__tests__idents.snap create mode 100644 ub_parser/src/snapshots/ub_parser__lexer__tests__keywords.snap create mode 100644 ub_parser/src/snapshots/ub_parser__lexer__tests__literals.snap create mode 100644 ub_parser/src/snapshots/ub_parser__lexer__tests__punctuation.snap create mode 100644 ub_parser/src/snapshots/ub_parser__lexer__tests__whitespace.snap diff --git a/Cargo.lock b/Cargo.lock index 1c1d9d4..0349f6c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -11,6 +11,12 @@ dependencies = [ "const-random", ] +[[package]] +name = "autocfg" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" + [[package]] name = "beef" version = "0.5.1" @@ -32,6 +38,19 @@ dependencies = [ "ahash", ] +[[package]] +name = "console" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28b32d32ca44b70c3e4acd7db1babf555fa026e385fb95f18028f88848b3c31" +dependencies = [ + "encode_unicode", + "libc", + "once_cell", + "terminal_size", + "winapi", +] + [[package]] name = "const-random" version = "0.1.13" @@ -60,6 +79,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +[[package]] +name = "encode_unicode" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a357d28ed41a50f9c765dbfe56cbc04a64e53e5fc58ba79fbc34c10ef3df831f" + [[package]] name = "fnv" version = "1.0.7" @@ -77,6 +102,42 @@ dependencies = [ "wasi", ] +[[package]] +name = "hashbrown" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" + +[[package]] +name = "indexmap" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282a6247722caba404c065016bbfa522806e51714c34f5dfc3e4a3a46fcb4223" +dependencies = [ + "autocfg", + "hashbrown", +] + +[[package]] +name = "insta" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30a7e1911532a662f6b08b68f884080850f2fd9544963c3ab23a5af42bda1eac" +dependencies = [ + "console", + "once_cell", + "serde", + "serde_json", + "serde_yaml", + "similar", +] + +[[package]] +name = "itoa" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" + [[package]] name = "lazy_static" version = "1.4.0" @@ -89,6 +150,12 @@ version = "0.2.121" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "efaa7b300f3b5fe8eb6bf21ce3895e1751d9665086af2d64b42f19701015ff4f" +[[package]] +name = "linked-hash-map" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fb9b38af92608140b86b693604b9ffcc5824240a484d1ecd4795bacb2fe88f3" + [[package]] name = "logos" version = "0.12.0" @@ -113,6 +180,12 @@ dependencies = [ "utf8-ranges", ] +[[package]] +name = "once_cell" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"87f3e037eac156d1775da914196f0f37741a274155e34a0b7e427c35d2a2ecb9" + [[package]] name = "proc-macro-hack" version = "0.5.19" @@ -143,6 +216,61 @@ version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" +[[package]] +name = "ryu" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f" + +[[package]] +name = "serde" +version = "1.0.136" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ce31e24b01e1e524df96f1c2fdd054405f8d7376249a5110886fb4b658484789" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.136" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08597e7152fcd306f41838ed3e37be9eaeed2b61c42e2117266a554fab4662f9" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "serde_json" +version = "1.0.79" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e8d9fa5c3b304765ce1fd9c4c8a3de2c8db365a5b91be52f186efc675681d95" +dependencies = [ + "itoa", + "ryu", + "serde", +] + +[[package]] +name = "serde_yaml" +version = "0.8.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4a521f2940385c165a24ee286aa8599633d162077a54bdcae2a6fd5a7bfa7a0" +dependencies = [ + "indexmap", + "ryu", + "serde", + "yaml-rust", +] + +[[package]] +name = "similar" +version = "2.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e24979f63a11545f5f2c60141afe249d4f19f84581ea2138065e400941d83d3" + [[package]] name = "syn" version = "1.0.89" @@ -154,6 +282,16 @@ dependencies = [ "unicode-xid", ] +[[package]] +name = "terminal_size" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"633c1a546cee861a1a6d0dc69ebeca693bf4296661ba7852b9d21d159e0506df" +dependencies = [ + "libc", + "winapi", +] + [[package]] name = "tiny-keccak" version = "2.0.2" @@ -175,6 +313,7 @@ name = "ub_parser" version = "0.1.0" dependencies = [ "chumsky", + "insta", "logos", ] @@ -195,3 +334,34 @@ name = "wasi" version = "0.10.2+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "yaml-rust" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56c1936c4cc7a1c9ab21a1ebb602eb942ba868cbd44a99cb7cdc5892335e1c85" +dependencies = [ + "linked-hash-map", +] diff --git a/ub_parser/Cargo.toml b/ub_parser/Cargo.toml index d22117e..680a077 100644 --- a/ub_parser/Cargo.toml +++ b/ub_parser/Cargo.toml @@ -8,3 +8,6 @@ edition = "2021" [dependencies] chumsky = "0.8.0" logos = "0.12.0" + +[dev-dependencies] +insta = "1.13.0" diff --git a/ub_parser/src/ast.rs b/ub_parser/src/ast.rs index eb4e90e..600116a 100644 --- a/ub_parser/src/ast.rs +++ b/ub_parser/src/ast.rs @@ -137,6 +137,7 @@ pub enum BinOpKind { Or, BitAnd, BitOr, + Xor, } #[derive(Debug, Clone, PartialEq)] diff --git a/ub_parser/src/lexer.rs 
b/ub_parser/src/lexer.rs index 34ac59b..fa4b887 100644 --- a/ub_parser/src/lexer.rs +++ b/ub_parser/src/lexer.rs @@ -1,7 +1,10 @@ use logos::Logos; #[derive(Logos, Debug, PartialEq)] -pub enum Token { +pub enum Token<'a> { + #[regex("//[^\n]*", logos::skip)] + Comment, + // punctuation #[token("{")] BraceO, @@ -17,6 +20,10 @@ pub enum Token { ParenC, #[token(".")] Dot, + #[token(",")] + Comma, + #[token(";")] + Semi, #[token("=")] Eq, #[token("==")] @@ -41,21 +48,88 @@ pub enum Token { Plus, #[token("-")] Minus, + #[token("|")] + Or, + #[token("&")] + And, + #[token("||")] + OrOr, + #[token("&&")] + AndAnd, + #[token("^")] + Caret, // keywords #[token("struct")] Struct, #[token("fn")] Fn, + #[token("if")] + If, + #[token("else")] + Else, + #[token("while")] + While, + #[token("loop")] + Loop, #[regex(r"[a-zA-Z_]\w*")] - Ident(String), + Ident(&'a str), + + #[regex(r##""[^"]*""##)] + String(&'a str), + + #[regex(r"\d+")] + Integer(&'a str), #[error] - #[regex(r"[ \t\n\r\f]+"), logos::skip] + #[regex(r"[ \t\r\n]+", logos::skip)] Error, } -pub fn lex(code: &str) -> logos::Lexer<'_, Token> { +pub fn lex<'src>(code: &'src str) -> logos::Lexer<'_, Token<'src>> { Token::lexer(code) } + +#[cfg(test)] +mod tests { + use crate::lexer::Token; + + fn lex_test(str: &str) -> Vec<Token<'_>> { + let lexer = super::lex(str); + lexer.collect() + } + + #[test] + fn punctuation() { + let tokens = lex_test("{} [] () .,; = == != >= <= < > + - * / | || & && ^"); + insta::assert_debug_snapshot!(tokens); + } + + #[test] + fn whitespace() { + let tokens = lex_test( + ". + \r\n \t .", + ); + insta::assert_debug_snapshot!(tokens); + } + + #[test] + fn idents() { + let tokens = lex_test("hello w_world b235_"); + insta::assert_debug_snapshot!(tokens); + } + + #[test] + fn literals() { + let tokens = lex_test(r##""hello friend" 5 "morning" 3263475"##); + insta::assert_debug_snapshot!(tokens); + } + + #[test] + fn keywords() { + let tokens = lex_test("struct fn . 
if else while loop;"); + insta::assert_debug_snapshot!(tokens); + } +} diff --git a/ub_parser/src/snapshots/ub_parser__lexer__tests__idents.snap b/ub_parser/src/snapshots/ub_parser__lexer__tests__idents.snap new file mode 100644 index 0000000..ebf4642 --- /dev/null +++ b/ub_parser/src/snapshots/ub_parser__lexer__tests__idents.snap @@ -0,0 +1,15 @@ +--- +source: ub_parser/src/lexer.rs +expression: tokens +--- +[ + Ident( + "hello", + ), + Ident( + "w_world", + ), + Ident( + "b235_", + ), +] diff --git a/ub_parser/src/snapshots/ub_parser__lexer__tests__keywords.snap b/ub_parser/src/snapshots/ub_parser__lexer__tests__keywords.snap new file mode 100644 index 0000000..1fce087 --- /dev/null +++ b/ub_parser/src/snapshots/ub_parser__lexer__tests__keywords.snap @@ -0,0 +1,14 @@ +--- +source: ub_parser/src/lexer.rs +expression: tokens +--- +[ + Struct, + Fn, + Dot, + If, + Else, + While, + Loop, + Semi, +] diff --git a/ub_parser/src/snapshots/ub_parser__lexer__tests__literals.snap b/ub_parser/src/snapshots/ub_parser__lexer__tests__literals.snap new file mode 100644 index 0000000..6de01fe --- /dev/null +++ b/ub_parser/src/snapshots/ub_parser__lexer__tests__literals.snap @@ -0,0 +1,18 @@ +--- +source: ub_parser/src/lexer.rs +expression: tokens +--- +[ + String( + "\"hello friend\"", + ), + Integer( + "5", + ), + String( + "\"morning\"", + ), + Integer( + "3263475", + ), +] diff --git a/ub_parser/src/snapshots/ub_parser__lexer__tests__punctuation.snap b/ub_parser/src/snapshots/ub_parser__lexer__tests__punctuation.snap new file mode 100644 index 0000000..ffa69d0 --- /dev/null +++ b/ub_parser/src/snapshots/ub_parser__lexer__tests__punctuation.snap @@ -0,0 +1,31 @@ +--- +source: ub_parser/src/lexer.rs +expression: tokens +--- +[ + BraceO, + BraceC, + BracketO, + BracketC, + ParenO, + ParenC, + Dot, + Comma, + Semi, + Eq, + EqEq, + BangEq, + GreaterEq, + LessEq, + Less, + Greater, + Plus, + Minus, + Asterisk, + Slash, + Or, + OrOr, + And, + AndAnd, + Caret, +] diff --git 
a/ub_parser/src/snapshots/ub_parser__lexer__tests__whitespace.snap b/ub_parser/src/snapshots/ub_parser__lexer__tests__whitespace.snap new file mode 100644 index 0000000..7582c2c --- /dev/null +++ b/ub_parser/src/snapshots/ub_parser__lexer__tests__whitespace.snap @@ -0,0 +1,8 @@ +--- +source: ub_parser/src/lexer.rs +expression: tokens +--- +[ + Dot, + Dot, +] diff --git a/ub_parser/src/span.rs b/ub_parser/src/span.rs index 9845d00..883bc42 100644 --- a/ub_parser/src/span.rs +++ b/ub_parser/src/span.rs @@ -1,5 +1,22 @@ +use std::ops::Range; + #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)] pub struct Span { start: usize, len: usize, } + +impl Span { + pub fn start_end(start: usize, end: usize) -> Self { + Self { + start, + len: end - start, + } + } +} + +impl From<Range<usize>> for Span { + fn from(r: Range<usize>) -> Self { + Self::start_end(r.start, r.end) + } +}