//! //! Compiles the regex AST into a finite state machine //! //! The basic idea is to return the index and char transition to the new node, which will then //! be put into the transitions of the current one //! //! Example regex: /a|b*c/ //! //! Using {x} to reference items in the text below //! //! Has the AST //! ```txt //! Choice {1} //! |-- Char(a) //! |-- Seq {4} //! | -- Repeat(Char(b)) {5} //! | -- Char(c) {6} //! ``` //! Should compile into the following state machine //! //! ```txt //! {3} {2} //! /--a--(()) //! ( START ) //! \--b---()--c--(()) //! /\ //! b_| //! ``` //! //! For that, we compile each node individually //! //! Compiling the choice node {2} will get us all nodes and transitions we need //! Compiling the first char node will add a single node {2}. //! This compilation will then return the index to this node and also the char it needs, //! being the char that is contained in this char node. The choice node {2} will then //! add a transition {3} to the start node it created. //! Note that the choice node {2} does not now that it is the start node. //! //! The same is being done for the second child of the choice, although it's a bit more //! complicated for that one. //! First we compile the seq node {4}. This will directly lead to compiling it's two child nodes, //! ({5}, {6}). //! Compiling the repeat node {5} returns it's index and also the char that leads to it. //! The char that leads to a repeat node is the one it repeats. //! For the char node {6}, it's very similar to the char node below the choice node {1}. //! //! Another example: /u(w|o)!/ //! //! AST: //! ```txt //! Seq //! |-- Char(u) //! |-- Choice //! |-- Char(w) //! |-- Char(o) //! |-- Char(!) //! ``` //! //! ```txt //! //! /-w--()--\ //! ( START )--u--() |--!-(()) //! \-o--()--/ //! //! ``` //! //! //! AST nodes will become transitions in the FSM //! FSM nodes are the connections in the AST //! //! This architecture mostly seems to work out, with the only problem currently being allocating nodes //! this appears to be something not every kind of regex part does. //! //! A char will allocate the node for its transition. //! A seq won't to that, because the contents of the seq allocate everything, the seq is just a wrapper. //! Now the question is: is seq unique and should be special cased, or can something like it exist? //! //! Does choice allocate a node? No, it does not, it only branches. So allocating seems like something //! that some kinds do, but not all of them. //! //! So allocating is something that is not fundamental to the compilation, but handled by each node. type NodeIndex = usize; impl Compiler { /// This function takes the node index of the previous node, constructs a new one as the target, /// and then creates a transition from the previous to the new one, containing the condition /// of the AST node it is compiling. /// It returns fn compile(&mut self, regex: &Regex, node_before: NodeIndex) -> NodeIndex { match regex { Regex::Char(char) => self.allocating(node_before, |_, _| TransitionType::Char(*char)), Regex::Sequence(terms) => { if let Some(first) = terms.first() { let trans_to_first = self.compile(first, 0); } else { TransitionType::Always; }; todo!() } Regex::Primitive(primitive) => self.allocating(node_before, |_, _| { TransitionType::Primitive(match primitive { parse::Primitive::Word => Primitive::Word, parse::Primitive::Digit => Primitive::Digit, }) }), Regex::Choice(a, b) => { todo!() } Regex::Repetition(_) => { todo!() } Regex::Set(_) => { todo!() } Regex::Range(_) => { todo!() } } } fn allocating TransitionType>( &mut self, node_before: NodeIndex, f: F, ) -> NodeIndex { let next_node_slot = self.reserve_node_slot(); let mut next_node = Node::default(); let this_condition = f(&mut next_node, next_node_slot); // fill the placeholder with the node we just created, forget the placeholder let _ = std::mem::replace(self.nodes.get_mut(next_node_slot).unwrap(), next_node); self.nodes .get_mut(node_before) .unwrap() .transitions .push(Transition { target_node: next_node_slot, condition: this_condition, }); next_node_slot } } use crate::parse; use crate::parse::Regex; use std::ops::Range; #[derive(Debug, Clone, PartialEq, Eq)] struct Transition { target_node: usize, condition: TransitionType, } #[derive(Debug, Clone, PartialEq, Eq)] enum TransitionType { Range(Range), Primitive(Primitive), Char(char), Always, } #[derive(Debug, Copy, Clone, PartialEq, Eq)] enum Primitive { Word, Digit, } #[derive(Debug, Clone, Default, PartialEq, Eq)] struct Node { end: bool, transitions: Vec, } #[derive(Debug, Clone, PartialEq, Eq)] struct RegexFsm { nodes: Vec, } #[derive(Debug, Default)] struct Compiler { nodes: Vec, } /// Compiles the parsed Regex into a FSM fn compile(regex: &Regex) -> RegexFsm { let mut compiler = Compiler::default(); // reserve the start node compiler.reserve_node_slot(); compiler.compile(regex, 0); RegexFsm { nodes: compiler.nodes, } } impl Compiler { /// Pushes a placeholder node into the internal buffer and returns it's index fn reserve_node_slot(&mut self) -> NodeIndex { self.nodes.push(Node::default()); self.nodes.len() - 1 } } #[cfg(test)] mod test { use crate::compile::{Node, RegexFsm, Transition, TransitionType}; use crate::parse::Regex; /// /// regex: /🌈/ /// fsm: () --🌈-- (()) #[test] fn single_char() { let ast = Regex::Char('🌈'); let fsm = super::compile(&ast); assert_eq!( fsm, RegexFsm { nodes: vec![ Node { end: false, transitions: vec![Transition { target_node: 1, condition: TransitionType::Char('🌈') }] }, Node { end: true, transitions: vec![] } ] } ) } }