Rust の html5ever ライブラリで HTML をパースする。Servo で使われているものと思われる。メモ。
https://github.com/servo/html5ever
High-performance browser-grade HTML5 parser

Cargo.toml
[package]
name = "htmlpaser-test"
version = "0.1.0"
edition = "2024"

[dependencies]
html5ever = "0.27.0"
markup5ever_rcdom = "0.3.0"
src/main.rs
use html5ever::driver::parse_document;
use html5ever::tendril::TendrilSink;
use markup5ever_rcdom::{RcDom, Handle};
use std::default::Default;
/// Parses an HTML document given as a string slice and returns the resulting
/// `RcDom` tree.
///
/// The parser is created with default options and a fresh, default-constructed
/// `RcDom` as its sink; the whole input is fed to it in one shot.
fn parse_html(html: &str) -> RcDom {
    let sink = RcDom::default();
    let parser = parse_document(sink, Default::default());
    // `one` (from `TendrilSink`) feeds the single input and finishes the parse.
    parser.one(html)
}
// DOM を再帰的に表示する関数
fn print_dom(handle: &Handle, depth: usize) {
let indent = " ".repeat(depth);
let node = handle;
match node.data {
markup5ever_rcdom::NodeData::Document => {
println!("{}Document", indent);
}
markup5ever_rcdom::NodeData::Element { ref name, ref attrs, .. } => {
println!("{}Element: {}", indent, name.local);
for attr in attrs.borrow().iter() {
println!("{} Attribute: {}=\"{}\"", indent, attr.name.local, attr.value);
}
}
markup5ever_rcdom::NodeData::Text { ref contents } => {
println!("{}Text: {}", indent, contents.borrow());
}
_ => {}
}
for child in node.children.borrow().iter() {
print_dom(child, depth + 1);
}
}
/// Parses a small hard-coded HTML sample and prints its DOM tree to stdout.
fn main() {
    let source = "<!DOCTYPE html><html><head><title>Test</title></head><body><h1 th:loop='test'>Hello, world!</h1></body></html>";
    // Walk the tree starting from the document root at depth 0.
    print_dom(&parse_html(source).document, 0);
}
結果:
$ cargo run
Compiling htmlpaser-test v0.1.0 (/Users/hk2a/devel/rust/htmlpaser-test)
Finished `dev` profile [unoptimized + debuginfo] target(s) in 0.98s
Running `target/debug/htmlpaser-test`
Document
 Element: html
  Element: head
   Element: title
    Text: Test
  Element: body
   Element: h1
    Attribute: th:loop="test"
    Text: Hello, world!