//! Contrary to popular belief, Dennis Ritchie did not invent the C grammar.
//! The C grammar was brought to Dennis Ritchie by a demon in his worst dreams

// Provenance: this is parser/src/lexer.rs as introduced by the patch "lex lex"
// (commit ca33e5f0c5cc3c09d05374fc6782b9c1b04d35ce,
// nils <48135649+Nilstrieb@users.noreply.github.com>, Tue, 21 Jun 2022 09:04:00 +0200).
//
// Besides creating this file, the same patch:
//   * creates Cargo.lock (lockfile version 3) listing the packages
//     `parser` 0.1.0 and `uwuc` 0.1.0;
//   * prepends `[workspace]` / `members = [".", "parser"]` to Cargo.toml;
//   * replaces the generated `add` example and its `it_works` test in
//     parser/src/lib.rs with `#![allow(dead_code)]` followed by `mod lexer;`.

/// A preprocessing token, as produced by [`PLexer`].
///
/// The variants mirror the `preprocessing-token` grammar quoted on
/// [`PLexer`]'s `Iterator::next`. The lexer below currently only produces
/// `Identifier`, `PpNumber`, `StringLiteral`, `Punctuator`, `OtherNonWs` and
/// `Error`; `HeaderName` and `CharConstant` are declared but not yet emitted.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PToken {
    /// header-name (raw bytes).
    HeaderName(Vec<u8>),
    /// identifier (raw bytes).
    Identifier(Vec<u8>),
    /// pp-number (raw bytes).
    PpNumber(Vec<u8>),
    /// character-constant (no payload yet).
    CharConstant,
    /// string-literal: the bytes between the quotes, escapes kept verbatim.
    StringLiteral(Vec<u8>),
    /// punctuator.
    Punctuator(Punctuator),
    /// Any non-white-space byte that is none of the above.
    OtherNonWs(u8),
    /// Lexing failed (currently: input ended inside a string literal).
    Error,
}

/// A token after preprocessing. The payload-less tuple variants are
/// placeholders; nothing in this file constructs a `Token` yet.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Token {
    Keyword(Keyword),
    Identifier(),
    Constant(),
    StringLiteral(),
    Punctuator(Punctuator),
}

/// Keywords; no variants have been added yet.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Keyword {}

/// A constant value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Constant {
    Integer(i64),
}

/// 6.4.6 Punctuators. Each variant's doc comment shows its spelling(s),
/// including digraphs where the variant covers one.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Punctuator {
    /// [   <:
    BracketOpen,
    /// ]   :>
    BracketClose,
    /// (
    ParenOpen,
    /// )
    ParenClose,
    /// {   <%
    BraceOpen,
    /// }   %>
    BraceClose,
    /// .
    Dot,
    /// ->
    Arrow,
    /// ++
    PlusPlus,
    /// --
    MinusMinus,
    /// &
    Ampersand,
    /// *
    Asterisk,
    /// +
    Plus,
    /// -
    Minus,
    /// ~
    Tilde,
    /// ! 🔫
    Bang,
    /// /
    Slash,
    /// %
    Percent,
    /// <<
    LeftLeftChevron,
    /// >>
    RightRightChevron,
    /// <
    LeftChevron,
    /// >
    RightChevron,
    /// <=
    LeftChevronEqual,
    /// >=
    RightChevronEqual,
    /// ==
    EqualEqual,
    /// !=
    BangEqual,
    /// ^
    Caret,
    /// |
    Pipe,
    /// &&
    AmpersandAmpersand,
    /// ||
    PipePipe,
    /// ?
    QuestionMark,
    /// :
    Colon,
    /// ;
    Semicolon,
    /// ...
    DotDotDot,
    /// =
    Equal,
    /// *=
    AsteriskEqual,
    /// /=
    SlashEqual,
    /// %=
    PercentEqual,
    /// +=
    PlusEqual,
    /// -=
    MinusEqual,
    /// <<=
    LeftLeftChevronEqual,
    /// >>=
    RightRightChevronEqual,
    /// &=
    AmpersandEqual,
    /// ^=
    CaretEqual,
    /// |=
    PipeEqual,
    /// ,
    Comma,
    /// #   %:
    Hash,
    /// ##   %:%:
    HashHash,
}

/// Aliases for the variant names that were originally misspelled, so code
/// written against the old spellings keeps compiling (with a deprecation
/// warning). They work in both expression and pattern position.
#[allow(non_upper_case_globals)]
impl Punctuator {
    /// Old, misspelled name of [`Punctuator::AmpersandEqual`].
    #[deprecated(note = "misspelled; use `Punctuator::AmpersandEqual`")]
    pub const AmspersandEqual: Self = Self::AmpersandEqual;

    /// Old, misspelled name of [`Punctuator::Comma`].
    #[deprecated(note = "misspelled; use `Punctuator::Comma`")]
    pub const Comman: Self = Self::Comma;
}

/// Lexer turning a stream of source bytes into preprocessing tokens.
///
/// Tokens are pulled through its [`Iterator`] implementation.
struct PLexer<I>
where
    I: Iterator<Item = u8>,
{
    /// Remaining input; peekable so a token can stop *before* the first byte
    /// that does not belong to it.
    src: std::iter::Peekable<I>,
}

impl<I> PLexer<I>
where
    I: Iterator<Item = u8>,
{
    /// Creates a lexer reading from `src`.
    fn new(src: I) -> Self {
        Self {
            src: src.peekable(),
        }
    }

    /// 6.4.2 Identifiers
    /// TODO: 6.4.3 Universal character names
    ///
    /// `c` is the already-consumed first byte. Consumes every following
    /// ASCII letter, ASCII digit or `_` and returns them as
    /// [`PToken::Identifier`].
    fn identifier(&mut self, c: u8) -> PToken {
        let mut ident = vec![c];

        while let Some(next) = self
            .src
            .next_if(|b| b.is_ascii_alphanumeric() || *b == b'_')
        {
            ident.push(next);
        }

        PToken::Identifier(ident)
    }

    /// 6.4.8 Preprocessing numbers
    ///
    /// `c` is the already-consumed first digit. Consumes every following
    /// ASCII digit and returns them as [`PToken::PpNumber`].
    ///
    /// TODO: only plain digit runs are recognised; the pp-number grammar also
    /// admits `.`, identifier characters and signed exponents.
    fn number(&mut self, c: u8) -> PToken {
        let mut number = vec![c];

        while let Some(digit) = self.src.next_if(u8::is_ascii_digit) {
            number.push(digit);
        }

        PToken::PpNumber(number)
    }

    /// 6.4.5 String literals
    ///
    /// Called after the opening `"` has been consumed. Collects bytes up to
    /// (not including) the closing `"`, which is consumed as well. A
    /// backslash and the byte after it are stored verbatim, so an escaped
    /// quote (`\"`) does not terminate the literal.
    ///
    /// Returns [`PToken::Error`] if the input ends before the closing quote.
    fn string_literal(&mut self) -> PToken {
        let mut string = Vec::new();

        loop {
            match self.src.next() {
                // Ran out of input inside the literal.
                None => return PToken::Error,
                Some(b'"') => return PToken::StringLiteral(string),
                Some(b'\\') => {
                    string.push(b'\\');
                    // Keep the escaped byte as-is; escapes are not decoded here.
                    match self.src.next() {
                        Some(escaped) => string.push(escaped),
                        None => return PToken::Error,
                    }
                }
                Some(c) => string.push(c),
            }
        }
    }
}

impl<I> Iterator for PLexer<I>
where
    I: Iterator<Item = u8>,
{
    type Item = PToken;

    /// preprocessing-token:
    ///   header-name
    ///   identifier
    ///   pp-number
    ///   character-constant
    ///   string-literal
    ///   punctuator
    ///   each non-white-space character that cannot be one of the above
    ///
    /// Skips ASCII white space and returns the next token, or `None` once the
    /// input is exhausted. Of the punctuators only `[` is recognised so far;
    /// every other unrecognised byte comes back as [`PToken::OtherNonWs`].
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.src.next()? {
                c @ (b'a'..=b'z' | b'A'..=b'Z' | b'_') => {
                    return Some(self.identifier(c));
                }
                c @ b'0'..=b'9' => return Some(self.number(c)),
                b'"' => return Some(self.string_literal()),
                b'[' => return Some(PToken::Punctuator(Punctuator::BracketOpen)),
                // White space separates tokens but is not a token itself.
                c if c.is_ascii_whitespace() => {}
                c => return Some(PToken::OtherNonWs(c)),
            }
        }
    }
}