//! `zwergli` — a minimal DEFLATE ([RFC 1951]) decoder.
//!
//! Only blocks compressed with the fixed Huffman code (`BTYPE = 1`) are
//! supported; stored and dynamic blocks are rejected.
//!
//! Crate manifest: name `zwergli`, version `0.1.0`, edition 2024, no
//! dependencies.
//!
//! The crate's companion binary (`src/main.rs`) strips a gzip header and
//! trailer and hands the raw DEFLATE blocks to [`inflate`]:
//!
//! ```no_run
//! use std::ffi::CStr;
//!
//! fn main() {
//!     let gz = std::env::args().nth(1).unwrap();
//!     let gz = std::fs::read(gz).unwrap();
//!
//!     assert_eq!(gz[0], 31, "ID");
//!     assert_eq!(gz[1], 139, "ID");
//!     assert_eq!(gz[2], 8, "compression method");
//!
//!     let flg = gz[3];
//!
//!     assert!(flg == 8 || flg == 0); // only FLG.FNAME
//!
//!     let mut data_start = 10;
//!
//!     if flg & 0b1000 != 0 {
//!         let fname = CStr::from_bytes_until_nul(&gz[10..]).unwrap();
//!         dbg!(fname);
//!         data_start += fname.count_bytes() + 1;
//!     }
//!
//!     let blocks = &gz[(data_start)..];
//!     let blocks = &blocks[..(blocks.len() - 8)]; // crc32 and isize
//!
//!     let mut out = Vec::new();
//!
//!     zwergli::inflate(blocks, &mut out);
//!
//!     dbg!(&out);
//!     dbg!(String::from_utf8(out)).ok();
//! }
//! ```
//!
//! [RFC 1951]: https://datatracker.ietf.org/doc/html/rfc1951

/// Panic message used whenever the input ends in the middle of a block.
const TRUNCATED: &str = "unexpected end of deflate stream";

/// `(base length, extra bits)` for the length symbols 257..=285 (RFC 1951 §3.2.5).
const LENGTH_CODES: [(u16, u8); 29] = [
    (3, 0), (4, 0), (5, 0), (6, 0), (7, 0), (8, 0), (9, 0), (10, 0),
    (11, 1), (13, 1), (15, 1), (17, 1),
    (19, 2), (23, 2), (27, 2), (31, 2),
    (35, 3), (43, 3), (51, 3), (59, 3),
    (67, 4), (83, 4), (99, 4), (115, 4),
    (131, 5), (163, 5), (195, 5), (227, 5),
    (258, 0),
];

/// `(base distance, extra bits)` for the distance symbols 0..=29 (RFC 1951 §3.2.5).
const DISTANCE_CODES: [(u16, u8); 30] = [
    (1, 0), (2, 0), (3, 0), (4, 0),
    (5, 1), (7, 1),
    (9, 2), (13, 2),
    (17, 3), (25, 3),
    (33, 4), (49, 4),
    (65, 5), (97, 5),
    (129, 6), (193, 6),
    (257, 7), (385, 7),
    (513, 8), (769, 8),
    (1025, 9), (1537, 9),
    (2049, 10), (3073, 10),
    (4097, 11), (6145, 11),
    (8193, 12), (12289, 12),
    (16385, 13), (24577, 13),
];

/// Bit-level cursor over a byte slice.
///
/// `data` always starts at the byte currently being read and `pos_bit`
/// (0..=7) is the index of the next unread bit inside that byte, counted
/// from the least-significant bit.
struct Bitstream<'a> {
    data: &'a [u8],
    pos_bit: usize,
}

impl Bitstream<'_> {
    /// Reads a `len`-bit data element that is *not* a Huffman code.
    ///
    /// Per RFC 1951 §3.1.1 such elements are packed starting with their
    /// least-significant bit, so the first bit taken from the stream becomes
    /// bit 0 of the result — also when the element crosses a byte boundary.
    ///
    /// Returns `None` if the stream ends before `len` bits were available;
    /// the bits consumed up to that point are not restored. `len == 0`
    /// yields `Some(0)` without touching the stream.
    ///
    /// # Panics
    ///
    /// Panics if `len` is 64 or more.
    fn read_bits_normal(&mut self, mut len: usize) -> Option<u64> {
        assert!(len < u64::BITS as usize, "cannot read {} bits into a u64", len);

        let mut result = 0u64;
        // Number of bits already placed into `result`.
        let mut filled = 0;

        while len > 0 {
            let &byte = self.data.first()?;

            // Take as much as the current byte still offers, but never more
            // than requested.
            let take = len.min(8 - self.pos_bit);
            // The mask is built in u64 because `take` can be a full 8 bits.
            let chunk = (u64::from(byte) >> self.pos_bit) & ((1u64 << take) - 1);

            result |= chunk << filled;
            filled += take;
            len -= take;
            self.pos_bit += take;

            if self.pos_bit == 8 {
                self.data = &self.data[1..];
                self.pos_bit = 0;
            }
        }

        Some(result)
    }
}

/// Binary decoding tree for a canonical Huffman code.
///
/// `nodes[0]` is the root; every other node is referenced by index from
/// exactly one [`HuffmanNode::Cont`].
struct HuffmanTree {
    nodes: Vec<HuffmanNode>,
}

/// A node of a [`HuffmanTree`].
#[derive(Debug)]
enum HuffmanNode {
    /// A decoded symbol.
    Leaf(u16),
    /// An inner node holding the indices of the children reached by a `0`
    /// and a `1` bit. [`HuffmanTree::UNSET`] marks a missing child.
    Cont { zero: usize, one: usize },
}

/// Outcome of feeding one bit to [`HuffmanTree::lookup_with_state`].
enum HuffmanLookupResult {
    /// A complete code was read; the payload is the decoded symbol.
    Done(u16),
    /// More bits are needed; pass `next_state` to the next lookup.
    Incomplete { next_state: usize },
}

impl HuffmanTree {
    /// Child index meaning "no node here yet". Index 0 is the root, which
    /// can never be anybody's child, so it is free to serve as the marker.
    const UNSET: usize = 0;

    /// Code lengths of the fixed literal/length alphabet (RFC 1951 §3.2.6).
    fn fixed_code_lengths() -> [u8; 288] {
        let mut lengths = [0; 288];
        lengths[..144].fill(8);
        lengths[144..256].fill(9);
        lengths[256..280].fill(7);
        lengths[280..].fill(8);
        lengths
    }

    /// Builds the decoding tree for the canonical Huffman code described by
    /// `lengths`, where `lengths[symbol]` is the code length of `symbol` and
    /// `0` means the symbol is unused (RFC 1951 §3.2.2).
    ///
    /// # Panics
    ///
    /// Panics if a length exceeds 15 bits, if there are more than 65536
    /// symbols, or if the lengths are over-subscribed (one code would be a
    /// prefix of another).
    fn from_lengths(lengths: &[u8]) -> Self {
        const MAX_BITS: usize = 15;

        assert!(
            lengths.len() <= usize::from(u16::MAX) + 1,
            "too many symbols for a u16 alphabet: {}",
            lengths.len()
        );

        // Step 1: count how many codes exist per length.
        let mut bl_count = [0u32; MAX_BITS + 1];
        for &len in lengths {
            let len = usize::from(len);
            assert!(
                len <= MAX_BITS,
                "code length {} exceeds the DEFLATE maximum of {}",
                len,
                MAX_BITS
            );
            bl_count[len] += 1;
        }
        // Unused symbols must not take part in the code assignment.
        bl_count[0] = 0;

        // Step 2: smallest code value for every length.
        let mut next_code = [0u32; MAX_BITS + 1];
        let mut code = 0u32;
        for bits in 1..=MAX_BITS {
            code = (code + bl_count[bits - 1]) << 1;
            next_code[bits] = code;
        }

        // Step 3: hand out consecutive codes in symbol order and insert each
        // one into the tree, most-significant bit first.
        let mut nodes = vec![HuffmanNode::Cont {
            zero: Self::UNSET,
            one: Self::UNSET,
        }];
        for (symbol, &len) in lengths.iter().enumerate() {
            if len == 0 {
                continue;
            }
            let code = next_code[usize::from(len)];
            next_code[usize::from(len)] += 1;

            let mut current = 0;
            // `remaining` is the number of code bits below the one handled
            // in this iteration; 0 means this bit ends the code.
            for remaining in (0..len).rev() {
                let bit = (code >> remaining) & 1;
                let fresh = nodes.len();
                let HuffmanNode::Cont { zero, one } = &mut nodes[current] else {
                    panic!("over-subscribed code lengths: a code passes through a leaf");
                };
                let slot = if bit == 0 { zero } else { one };

                if remaining == 0 {
                    assert!(
                        *slot == Self::UNSET,
                        "over-subscribed code lengths: code is a prefix of another"
                    );
                    *slot = fresh;
                    nodes.push(HuffmanNode::Leaf(symbol as u16));
                } else if *slot != Self::UNSET {
                    // The inner node already exists, just walk into it.
                    current = *slot;
                } else {
                    // Create the inner node and walk into it.
                    *slot = fresh;
                    nodes.push(HuffmanNode::Cont {
                        zero: Self::UNSET,
                        one: Self::UNSET,
                    });
                    current = fresh;
                }
            }
        }

        Self { nodes }
    }

    /// Advances the decoder by one `bit` starting from `state` (0 is the
    /// root, otherwise the `next_state` of a previous
    /// [`HuffmanLookupResult::Incomplete`]).
    ///
    /// # Panics
    ///
    /// Panics if `state` does not refer to an inner node, or if the bits
    /// read so far do not belong to any symbol of an incomplete code.
    fn lookup_with_state(&self, state: usize, bit: u64) -> HuffmanLookupResult {
        let HuffmanNode::Cont { zero, one } = self.nodes[state] else {
            unreachable!("invalid state, should point at continuation node");
        };
        let next_state = if bit == 0 { zero } else { one };
        assert!(
            next_state != Self::UNSET,
            "bit sequence is not assigned to any symbol"
        );
        match self.nodes[next_state] {
            HuffmanNode::Leaf(leaf) => HuffmanLookupResult::Done(leaf),
            HuffmanNode::Cont { .. } => HuffmanLookupResult::Incomplete { next_state },
        }
    }

    /// Renders the tree as a Graphviz `digraph`.
    ///
    /// Inner nodes are named after the bit path leading to them (`_`, `_0`,
    /// `_01`, …), leaves after their symbol number; edges are labelled with
    /// the bit they consume. Missing children are skipped.
    // Debugging aid only; nothing in the decoder calls it.
    #[allow(dead_code)]
    fn to_dot(&self) -> String {
        use std::fmt::Write;

        fn emit(
            tree: &HuffmanTree,
            out: &mut String,
            parent: Option<&str>,
            choice: &str,
            this_node: &str,
            node: &HuffmanNode,
        ) -> std::fmt::Result {
            match node {
                HuffmanNode::Leaf(number) => {
                    writeln!(
                        out,
                        "{parent} -> {number} [label={choice}]",
                        parent = parent.unwrap(),
                    )?;
                }
                HuffmanNode::Cont { zero, one } => {
                    if let Some(parent) = parent {
                        writeln!(out, "{parent} -> {this_node} [label={choice}]")?;
                    }
                    for (label, child) in [("0", *zero), ("1", *one)] {
                        // An unset child would point back at the root and
                        // recurse forever.
                        if child == HuffmanTree::UNSET {
                            continue;
                        }
                        emit(
                            tree,
                            out,
                            Some(this_node),
                            label,
                            &format!("{this_node}{label}"),
                            &tree.nodes[child],
                        )?;
                    }
                }
            }
            Ok(())
        }

        fn render(tree: &HuffmanTree, out: &mut String) -> std::fmt::Result {
            writeln!(out, "digraph huffman_tree {{")?;
            emit(tree, out, None, "", "_", &tree.nodes[0])?;
            writeln!(out, "}}")
        }

        let mut out = String::new();
        render(self, &mut out).unwrap();
        out
    }
}

/// Decodes one Huffman-coded symbol by walking `tree` bit by bit.
fn next_symbol(tree: &HuffmanTree, stream: &mut Bitstream) -> u16 {
    let mut state = 0;
    loop {
        let bit = stream.read_bits_normal(1).expect(TRUNCATED);
        match tree.lookup_with_state(state, bit) {
            HuffmanLookupResult::Done(symbol) => return symbol,
            HuffmanLookupResult::Incomplete { next_state } => state = next_state,
        }
    }
}

/// Handles a length symbol (257..=285): reads the rest of the
/// `<length, distance>` pair and appends the referenced bytes to `out`.
///
/// `window_start` is the index in `out` where this call's output begins;
/// matches may not reach further back than that.
fn copy_match(symbol: u16, stream: &mut Bitstream, out: &mut Vec<u8>, window_start: usize) {
    let (base, extra) = LENGTH_CODES[usize::from(symbol - 257)];
    let extra = stream
        .read_bits_normal(usize::from(extra))
        .expect(TRUNCATED);
    // At most 5 extra bits, so the cast cannot truncate.
    let length = usize::from(base) + extra as usize;

    // Fixed blocks encode distances as plain 5-bit codes. They are still
    // Huffman codes, i.e. packed most-significant bit first.
    let mut distance_symbol = 0usize;
    for _ in 0..5 {
        let bit = stream.read_bits_normal(1).expect(TRUNCATED);
        distance_symbol = (distance_symbol << 1) | bit as usize;
    }
    let &(base, extra) = DISTANCE_CODES
        .get(distance_symbol)
        .unwrap_or_else(|| panic!("invalid distance symbol {}", distance_symbol));
    let extra = stream
        .read_bits_normal(usize::from(extra))
        .expect(TRUNCATED);
    // At most 13 extra bits, so the cast cannot truncate.
    let distance = usize::from(base) + extra as usize;

    assert!(
        distance <= out.len() - window_start,
        "match distance {} reaches before the start of the output",
        distance
    );

    // Source and destination may overlap (distance < length), which repeats
    // the most recent bytes; copying byte by byte gives exactly that.
    let from = out.len() - distance;
    out.reserve(length);
    for offset in 0..length {
        let byte = out[from + offset];
        out.push(byte);
    }
}

/// Decompresses the raw DEFLATE stream `data` and appends the result to
/// `out`. Bytes already present in `out` are left untouched and are never
/// used as match history.
///
/// See <https://datatracker.ietf.org/doc/html/rfc1951>.
///
/// # Panics
///
/// Panics if the stream is truncated or malformed, or if it contains a
/// block that does not use the fixed Huffman code (`BTYPE != 1`).
pub fn inflate(data: &[u8], out: &mut Vec<u8>) {
    // The fixed code is the same for every block, so build its tree once.
    let tree = HuffmanTree::from_lengths(&HuffmanTree::fixed_code_lengths());
    let window_start = out.len();

    let mut data = Bitstream { data, pos_bit: 0 };

    loop {
        let bfinal = data.read_bits_normal(1).expect(TRUNCATED);
        let btype = data.read_bits_normal(2).expect(TRUNCATED);

        assert_eq!(btype, 1, "not a static huffman tree construction");

        loop {
            let symbol = next_symbol(&tree, &mut data);
            match symbol {
                0..=255 => out.push(symbol as u8),
                // End of block.
                256 => break,
                257..=285 => copy_match(symbol, &mut data, out, window_start),
                // 286 and 287 have codes in the fixed tree but must never
                // occur in compressed data.
                _ => panic!("invalid literal/length symbol {}", symbol),
            }
        }

        if bfinal == 1 {
            break;
        }
    }
}

#[cfg(test)]
mod tests {
    use super::{Bitstream, HuffmanTree};

    #[test]
    fn bitstream() {
        let bytes = [0b110_010_01_u8, 0b010_111_01];
        let mut stream = Bitstream {
            data: &bytes,
            pos_bit: 0,
        };
        assert_eq!(stream.read_bits_normal(2).unwrap(), 0b01);
        assert_eq!(stream.read_bits_normal(3).unwrap(), 0b010);
        // Crosses the byte boundary: the three bits left in the first byte
        // are the low bits, the two bits from the second byte the high bits.
        assert_eq!(stream.read_bits_normal(5).unwrap(), 0b01_110);
        assert_eq!(stream.read_bits_normal(3).unwrap(), 0b111);
        assert_eq!(stream.read_bits_normal(3).unwrap(), 0b010);
    }

    #[test]
    fn decode() {
        let lengths = HuffmanTree::fixed_code_lengths();
        let tree = HuffmanTree::from_lengths(&lengths);
        // The fixed code is complete: 288 leaves plus 287 inner nodes.
        assert_eq!(tree.nodes.len(), 575);
    }
}