From 36c7274ab0fcb08b1380fe07099c035fb31cacdb Mon Sep 17 00:00:00 2001
From: Nilstrieb <48135649+Nilstrieb@users.noreply.github.com>
Date: Mon, 20 Nov 2023 20:22:34 +0100
Subject: [PATCH] lex lex hex hex

---
 .gitignore   |   1 +
 Cargo.lock   |   9 ++
 Cargo.toml   |   1 +
 src/lib.rs   |  39 ++++++++
 src/main.rs  |   4 +-
 src/parse.rs | 251 +++++++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 303 insertions(+), 2 deletions(-)
 create mode 100644 src/lib.rs
 create mode 100644 src/parse.rs

diff --git a/.gitignore b/.gitignore
index ea8c4bf..d8db536 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
 /target
+/example*.sh
\ No newline at end of file
diff --git a/Cargo.lock b/Cargo.lock
index 40681c0..8ef3c44 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2,6 +2,15 @@
 # It is not intended for manual editing.
 version = 3
 
+[[package]]
+name = "anyhow"
+version = "1.0.75"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a4668cab20f66d8d020e1fbc0ebe47217433c1b6c8f2040faf858554e394ace6"
+
 [[package]]
 name = "brash"
 version = "0.1.0"
+dependencies = [
+ "anyhow",
+]
diff --git a/Cargo.toml b/Cargo.toml
index 358b5cd..d8b8667 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -6,3 +6,4 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
+anyhow = "1.0.75"
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..69b1248
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1,39 @@
+pub mod parse;
+
+use std::sync::Arc;
+
+use anyhow::{anyhow, bail, Context, Result};
+
+/// Runs brash on the given command-line arguments (without the program name).
+pub fn bash_it(args: impl Iterator<Item = String>) -> Result<()> {
+    let (filename, src) = parse_args_into_src(args)?;
+
+    parse::parse(filename, &src)?;
+
+    Ok(())
+}
+
+/// Resolves the arguments to `(filename, source)`: either `FILE`, which is read from disk,
+/// or `-c SOURCE`, which is used verbatim with an empty filename.
+fn parse_args_into_src(args: impl Iterator<Item = String>) -> Result<(Arc<str>, String)> {
+    let mut src = None;
+    let mut c = false;
+
+    for arg in args {
+        if src.is_some() {
+            bail!("usage: brash [FILE]");
+        }
+        if c {
+            src = Some(("".into(), arg));
+        } else if arg == "-c" {
+            c = true;
+        } else {
+            src = Some((
+                arg.clone().into(),
+                std::fs::read_to_string(&arg).with_context(|| format!("opening {arg}"))?,
+            ));
+        }
+    }
+
+    src.ok_or_else(|| anyhow!("usage: brash [FILE]"))
+}
diff --git a/src/main.rs b/src/main.rs
index e7a11a9..5a72a4d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1,3 +1,3 @@
-fn main() {
-    println!("Hello, world!");
+fn main() -> anyhow::Result<()> {
+    brash::bash_it(std::env::args().skip(1))
 }
diff --git a/src/parse.rs b/src/parse.rs
new file mode 100644
index 0000000..c1d84f8
--- /dev/null
+++ b/src/parse.rs
@@ -0,0 +1,251 @@
+//! https://www.gnu.org/software/bash/manual/bash.html
+
+use std::{borrow::Cow, iter, sync::Arc};
+
+use anyhow::{bail, Result};
+
+/// Lexes `src` and, for now, dumps the resulting tokens with `dbg!`.
+pub fn parse(filename: Arc<str>, src: &str) -> Result<()> {
+    let tokens = lex(filename, src)?;
+    dbg!(tokens);
+    Ok(())
+}
+
+/// A half-open byte range `start..end` inside the source named `filename`.
+#[derive(Debug)]
+pub struct Span {
+    pub filename: Arc<str>,
+    pub start: u32,
+    pub end: u32,
+}
+
+/// A word or operator; `value` borrows from the source unless an escape had to be resolved.
+#[derive(Debug)]
+pub struct Token<'a> {
+    pub value: Cow<'a, str>,
+    pub restriction: Restriction,
+    pub span: Span,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq)]
+pub enum Restriction {
+    No,
+    DoubleQuot,
+    SingleQuot,
+}
+
+struct Lexer<'a> {
+    filename: Arc<str>,
+    src: &'a str,
+    bytes: iter::Enumerate<std::str::Bytes<'a>>,
+    tokens: Vec<Token<'a>>,
+    /// Owned text of the current word; only allocated once the word contains an escape.
+    cloned_escaped: Option<String>,
+    /// Offset at which the current word starts.
+    last_start: u32,
+    /// Offset of the first byte of the current word not yet copied into `cloned_escaped`.
+    run_start: u32,
+}
+
+impl<'a> Lexer<'a> {
+    fn span_start_end(&self, start: u32, end: u32) -> Span {
+        Span {
+            filename: self.filename.clone(),
+            start,
+            end,
+        }
+    }
+
+    /// Returns whether the current word has no content before offset `i`.
+    fn word_is_empty(&self, i: u32) -> bool {
+        self.run_start == i && self.cloned_escaped.as_deref().unwrap_or("").is_empty()
+    }
+
+    /// Processes the byte `b` found at offset `i`.
+    fn handle_char(&mut self, i: u32, b: u8) -> Result<()> {
+        match b {
+            _ if is_metacharacter(b) => {
+                self.commit(i);
+                if !matches!(b, b' ' | b'\t' | b'\n') {
+                    self.tokens.push(Token {
+                        value: self.src[(i as usize)..(i as usize + 1)].into(),
+                        restriction: Restriction::No,
+                        span: self.span_start_end(i, i + 1),
+                    });
+                }
+            }
+            b'\\' => {
+                let Some((_, next)) = self.bytes.next() else {
+                    bail!("Trailing \\ in the file {}", self.filename);
+                };
+                let raw = &self.src[(self.run_start as usize)..(i as usize)];
+                self.cloned_escaped
+                    .get_or_insert_with(String::new)
+                    .push_str(raw);
+                // An escaped newline is a line continuation and vanishes. Any other escaped
+                // character stays literal: the next raw run starts right at it, which also
+                // keeps multi-byte characters intact.
+                self.run_start = if next == b'\n' { i + 2 } else { i + 1 };
+            }
+            // A `#` only starts a comment at the beginning of a word.
+            b'#' if self.word_is_empty(i) => self.skip_comment(),
+            _ => {}
+        }
+        Ok(())
+    }
+
+    /// Ends the current word at offset `i` (exclusive) and pushes it unless it is empty.
+    fn commit(&mut self, i: u32) {
+        let src = self.src;
+        let raw = &src[(self.run_start as usize)..(i as usize)];
+        let value: Cow<'a, str> = match self.cloned_escaped.take() {
+            None => raw.into(),
+            Some(mut cloned) => {
+                cloned.push_str(raw);
+                cloned.into()
+            }
+        };
+        // Repeated blanks, a blank next to an operator and a lone line continuation leave
+        // nothing between two word boundaries; that is not a word.
+        if !value.is_empty() {
+            let span = self.span_start_end(self.last_start, i);
+            self.tokens.push(Token {
+                value,
+                restriction: Restriction::No,
+                span,
+            });
+        }
+        self.last_start = i + 1;
+        self.run_start = i + 1;
+    }
+
+    /// Skips a comment, up to and including the next newline or the end of the input.
+    fn skip_comment(&mut self) {
+        let after = self
+            .bytes
+            .by_ref()
+            .find(|&(_, b)| b == b'\n')
+            .map_or(self.src.len(), |(newline, _)| newline + 1);
+        // Cannot truncate: `lex` rejects sources whose length does not fit in `u32`.
+        let after = after as u32;
+        self.cloned_escaped = None;
+        self.last_start = after;
+        self.run_start = after;
+    }
+}
+
+/// Splits `src` into words and operators, resolving backslash escapes and dropping
+/// blanks and comments.
+fn lex(filename: Arc<str>, src: &str) -> Result<Vec<Token<'_>>> {
+    // Offsets are stored as `u32` and word boundaries compute `offset + 1`, so the length
+    // itself has to stay below `u32::MAX`.
+    let Some(len) = u32::try_from(src.len()).ok().filter(|&len| len < u32::MAX) else {
+        bail!("file {} larger than 4GB", filename);
+    };
+
+    let mut lexer = Lexer {
+        filename,
+        src,
+        bytes: src.bytes().enumerate(),
+        tokens: Vec::new(),
+        cloned_escaped: None,
+        last_start: 0,
+        run_start: 0,
+    };
+
+    while let Some((i, b)) = lexer.bytes.next() {
+        // Cannot truncate: `i` is below the length that was just checked.
+        lexer.handle_char(i as u32, b)?;
+    }
+    lexer.commit(len);
+
+    Ok(lexer.tokens)
+}
+
+fn is_metacharacter(c: u8) -> bool {
+    [b' ', b'\t', b'\n', b'|', b'&', b';', b'(', b')', b'<', b'>'].contains(&c)
+}
+
+#[cfg(test)]
+mod tests {
+    mod lex {
+        use crate::parse::Restriction::{self, *};
+
+        fn test_eq(src: &str, tokens: impl AsRef<[(&'static str, Restriction)]>) {
+            let actual = super::super::lex("whatever".into(), src).unwrap();
+            let to_compare: Vec<_> = actual
+                .iter()
+                .map(|tok| (tok.value.as_ref(), tok.restriction))
+                .collect();
+
+            assert_eq!(tokens.as_ref(), to_compare);
+        }
+
+        #[test]
+        fn hello_world() {
+            test_eq("Hello, world!", [("Hello,", No), ("world!", No)]);
+        }
+
+        #[test]
+        fn newline_var() {
+            test_eq("echo $a\nb", [("echo", No), ("$a", No), ("b", No)])
+        }
+
+        #[test]
+        fn newline_var_escape() {
+            test_eq("echo $a\\\nb", [("echo", No), ("$ab", No)])
+        }
+
+        #[test]
+        fn metachars() {
+            test_eq(
+                "hello;world)yes",
+                [
+                    ("hello", No),
+                    (";", No),
+                    ("world", No),
+                    (")", No),
+                    ("yes", No),
+                ],
+            )
+        }
+
+        #[test]
+        fn comment() {
+            test_eq("hi # no", [("hi", No)]);
+        }
+
+        #[test]
+        fn comment_escaped_newline() {
+            test_eq("#test\\\nno", [("no", No)]);
+        }
+
+        #[test]
+        fn strange_comment() {
+            test_eq(
+                "no#true hello;#yes",
+                [("no#true", No), ("hello", No), (";", No)],
+            );
+        }
+
+        #[test]
+        fn comment_with_metachars() {
+            test_eq("a #b;c\nd", [("a", No), ("d", No)]);
+        }
+
+        #[test]
+        fn repeated_blanks() {
+            test_eq("  a ;  b\n\n", [("a", No), (";", No), ("b", No)]);
+        }
+
+        #[test]
+        fn escaped_hash_and_blank() {
+            test_eq("\\#a b\\ c", [("#a", No), ("b c", No)]);
+        }
+
+        #[test]
+        fn escape_keeps_unicode() {
+            test_eq("\\\u{e4}h\u{e9} \\\u{e9}", [("\u{e4}h\u{e9}", No), ("\u{e9}", No)]);
+        }
+    }
+}