Rust の html5ever ライブラリで HTML をパースする。Servo で使われているものと思われる。メモ。
https://github.com/servo/html5ever
High-performance browser-grade HTML5 parser

Cargo.toml
[package]
name = "htmlpaser-test"
version = "0.1.0"
edition = "2024"

[dependencies]
html5ever = "0.27.0"
markup5ever_rcdom = "0.3.0"
src/main.rs
use html5ever::driver::parse_document; use html5ever::tendril::TendrilSink; use markup5ever_rcdom::{RcDom, Handle}; use std::default::Default; // HTML ドキュメントをパースする関数 fn parse_html(html: &str) -> RcDom { parse_document(RcDom::default(), Default::default()).one(html) } // DOM を再帰的に表示する関数 fn print_dom(handle: &Handle, depth: usize) { let indent = " ".repeat(depth); let node = handle; match node.data { markup5ever_rcdom::NodeData::Document => { println!("{}Document", indent); } markup5ever_rcdom::NodeData::Element { ref name, ref attrs, .. } => { println!("{}Element: {}", indent, name.local); for attr in attrs.borrow().iter() { println!("{} Attribute: {}=\"{}\"", indent, attr.name.local, attr.value); } } markup5ever_rcdom::NodeData::Text { ref contents } => { println!("{}Text: {}", indent, contents.borrow()); } _ => {} } for child in node.children.borrow().iter() { print_dom(child, depth + 1); } } fn main() { let html = "<!DOCTYPE html><html><head><title>Test</title></head><body><h1 th:loop='test'>Hello, world!</h1></body></html>"; let dom = parse_html(html); // DOM を再帰的に表示 print_dom(&dom.document, 0); }
結果:
$ cargo run
   Compiling htmlpaser-test v0.1.0 (/Users/hk2a/devel/rust/htmlpaser-test)
    Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.98s
     Running `target/debug/htmlpaser-test`
Document
 Element: html
  Element: head
   Element: title
    Text: Test
  Element: body
   Element: h1
    Attribute: th:loop="test"
    Text: Hello, world!