mirror of
https://github.com/Noratrieb/regex.git
synced 2026-01-14 16:35:01 +01:00
238 lines
6.8 KiB
Rust
238 lines
6.8 KiB
Rust
//!
|
|
//! Compiles the regex AST into a finite state machine
|
|
//!
|
|
//! The basic idea is to return the index and char transition to the new node, which will then
|
|
//! be put into the transitions of the current one
|
|
//!
|
|
//! Example regex: /a|b*c/
|
|
//!
|
|
//! Using {x} to reference items in the text below
|
|
//!
|
|
//! Has the AST
|
|
//! ```txt
|
|
//! Choice {1}
|
|
//! |-- Char(a)
|
|
//! |-- Seq {4}
|
|
//! | -- Repeat(Char(b)) {5}
|
|
//! | -- Char(c) {6}
|
|
//! ```
|
|
//! Should compile into the following state machine
|
|
//!
|
|
//! ```txt
|
|
//! {3} {2}
|
|
//! /--a--(())
|
|
//! ( START )
|
|
//! \--b---()--c--(())
|
|
//! /\
|
|
//! b_|
|
|
//! ```
|
|
//!
|
|
//! For that, we compile each node individually
|
|
//!
|
|
//! Compiling the choice node {2} will get us all nodes and transitions we need
|
|
//! Compiling the first char node will add a single node {2}.
|
|
//! This compilation will then return the index to this node and also the char it needs,
|
|
//! being the char that is contained in this char node. The choice node {2} will then
|
|
//! add a transition {3} to the start node it created.
|
|
//! Note that the choice node {2} does not now that it is the start node.
|
|
//!
|
|
//! The same is being done for the second child of the choice, although it's a bit more
|
|
//! complicated for that one.
|
|
//! First we compile the seq node {4}. This will directly lead to compiling it's two child nodes,
|
|
//! ({5}, {6}).
|
|
//! Compiling the repeat node {5} returns it's index and also the char that leads to it.
|
|
//! The char that leads to a repeat node is the one it repeats.
|
|
//! For the char node {6}, it's very similar to the char node below the choice node {1}.
|
|
//!
|
|
//! Another example: /u(w|o)!/
|
|
//!
|
|
//! AST:
|
|
//! ```txt
|
|
//! Seq
|
|
//! |-- Char(u)
|
|
//! |-- Choice
|
|
//! |-- Char(w)
|
|
//! |-- Char(o)
|
|
//! |-- Char(!)
|
|
//! ```
|
|
//!
|
|
//! ```txt
|
|
//!
|
|
//! /-w--()--\
|
|
//! ( START )--u--() |--!-(())
|
|
//! \-o--()--/
|
|
//!
|
|
//! ```
|
|
//!
|
|
//!
|
|
//! AST nodes will become transitions in the FSM
|
|
//! FSM nodes are the connections in the AST
|
|
//!
|
|
//! This architecture mostly seems to work out, with the only problem currently being allocating nodes
|
|
//! this appears to be something not every kind of regex part does.
|
|
//!
|
|
//! A char will allocate the node for its transition.
|
|
//! A seq won't to that, because the contents of the seq allocate everything, the seq is just a wrapper.
|
|
//! Now the question is: is seq unique and should be special cased, or can something like it exist?
|
|
//!
|
|
//! Does choice allocate a node? No, it does not, it only branches. So allocating seems like something
|
|
//! that some kinds do, but not all of them.
|
|
//!
|
|
//! So allocating is something that is not fundamental to the compilation, but handled by each node.
|
|
|
|
type NodeIndex = usize;
|
|
|
|
impl Compiler {
|
|
/// This function takes the node index of the previous node, constructs a new one as the target,
|
|
/// and then creates a transition from the previous to the new one, containing the condition
|
|
/// of the AST node it is compiling.
|
|
/// It returns
|
|
fn compile(&mut self, regex: &Regex, node_before: NodeIndex) -> NodeIndex {
|
|
match regex {
|
|
Regex::Char(char) => self.allocating(node_before, |_, _| TransitionType::Char(*char)),
|
|
Regex::Sequence(terms) => {
|
|
if let Some(first) = terms.first() {
|
|
let trans_to_first = self.compile(first, 0);
|
|
} else {
|
|
TransitionType::Always;
|
|
};
|
|
todo!()
|
|
}
|
|
Regex::Primitive(primitive) => self.allocating(node_before, |_, _| {
|
|
TransitionType::Primitive(match primitive {
|
|
parse::Primitive::Word => Primitive::Word,
|
|
parse::Primitive::Digit => Primitive::Digit,
|
|
})
|
|
}),
|
|
Regex::Choice(a, b) => {
|
|
todo!()
|
|
}
|
|
Regex::Repetition(_) => {
|
|
todo!()
|
|
}
|
|
Regex::Set(_) => {
|
|
todo!()
|
|
}
|
|
Regex::Range(_) => {
|
|
todo!()
|
|
}
|
|
}
|
|
}
|
|
|
|
fn allocating<F: FnOnce(&mut Node, NodeIndex) -> TransitionType>(
|
|
&mut self,
|
|
node_before: NodeIndex,
|
|
f: F,
|
|
) -> NodeIndex {
|
|
let next_node_slot = self.reserve_node_slot();
|
|
let mut next_node = Node::default();
|
|
let this_condition = f(&mut next_node, next_node_slot);
|
|
// fill the placeholder with the node we just created, forget the placeholder
|
|
let _ = std::mem::replace(self.nodes.get_mut(next_node_slot).unwrap(), next_node);
|
|
self.nodes
|
|
.get_mut(node_before)
|
|
.unwrap()
|
|
.transitions
|
|
.push(Transition {
|
|
target_node: next_node_slot,
|
|
condition: this_condition,
|
|
});
|
|
|
|
next_node_slot
|
|
}
|
|
}
|
|
|
|
use crate::parse;
|
|
use crate::parse::Regex;
|
|
use std::ops::Range;
|
|
|
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
struct Transition {
|
|
target_node: usize,
|
|
condition: TransitionType,
|
|
}
|
|
|
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
enum TransitionType {
|
|
Range(Range<char>),
|
|
Primitive(Primitive),
|
|
Char(char),
|
|
Always,
|
|
}
|
|
|
|
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
|
|
enum Primitive {
|
|
Word,
|
|
Digit,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Default, PartialEq, Eq)]
|
|
struct Node {
|
|
end: bool,
|
|
transitions: Vec<Transition>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, PartialEq, Eq)]
|
|
struct RegexFsm {
|
|
nodes: Vec<Node>,
|
|
}
|
|
|
|
#[derive(Debug, Default)]
|
|
struct Compiler {
|
|
nodes: Vec<Node>,
|
|
}
|
|
|
|
/// Compiles the parsed Regex into a FSM
|
|
fn compile(regex: &Regex) -> RegexFsm {
|
|
let mut compiler = Compiler::default();
|
|
|
|
// reserve the start node
|
|
compiler.reserve_node_slot();
|
|
|
|
compiler.compile(regex, 0);
|
|
|
|
RegexFsm {
|
|
nodes: compiler.nodes,
|
|
}
|
|
}
|
|
|
|
impl Compiler {
|
|
/// Pushes a placeholder node into the internal buffer and returns it's index
|
|
fn reserve_node_slot(&mut self) -> NodeIndex {
|
|
self.nodes.push(Node::default());
|
|
self.nodes.len() - 1
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod test {
|
|
use crate::compile::{Node, RegexFsm, Transition, TransitionType};
|
|
use crate::parse::Regex;
|
|
|
|
///
|
|
/// regex: /🌈/
|
|
/// fsm: () --🌈-- (())
|
|
#[test]
|
|
fn single_char() {
|
|
let ast = Regex::Char('🌈');
|
|
let fsm = super::compile(&ast);
|
|
assert_eq!(
|
|
fsm,
|
|
RegexFsm {
|
|
nodes: vec![
|
|
Node {
|
|
end: false,
|
|
transitions: vec![Transition {
|
|
target_node: 1,
|
|
condition: TransitionType::Char('🌈')
|
|
}]
|
|
},
|
|
Node {
|
|
end: true,
|
|
transitions: vec![]
|
|
}
|
|
]
|
|
}
|
|
)
|
|
}
|
|
}
|