Skip to content

Anton293/microdom

Repository files navigation

microdom (Workspace)

Architecture Authors microdom License CI Stars

Quick Start

Install the main crate:

cargo add microdom

Basic usage:

use microdom::parse_html;

/// Parses a small HTML fragment and prints two XPath selections from it.
fn main() {
    let document = parse_html(r#"<h1>Fast API <b>parallel</b> calls</h1>"#);

    // Each selection is converted to a string before printing; the expected
    // output is shown next to the corresponding `println!`.
    let heading = document.select("//h1").unwrap().to_string();
    let emphasized = document.select("//h1/b").unwrap().to_string();

    println!("title: {}", heading); // <h1>Fast API <b>parallel</b> calls</h1>
    println!("bold: {}", emphasized); // <b>parallel</b>
}

Overview

XPath selection for HTML in Rust. Alpha stage, expect breaking changes.

Warning: The HTML parser attempts to follow the HTML5 specification and to repair broken HTML, but it currently does so imperfectly. It should nevertheless be sufficient for most parsing needs.

Workspace Structure

Why microdom?

  • 2x faster than html5ever (see benchmarks)
  • Zero dependencies
  • XPath 3.1 support out of the box
  • Custom function extensions

Benchmarks

| Library | HTML+CSS+JS ~10KB | Clean HTML ~20KB | Micro | Avg speed | Notes |
|---|---|---|---|---|---|
| microdom | 75 MiB/s | 81 MiB/s | 3.1 MiB/s | ~53 MiB/s | Full DOM tree ¹ |
| html5ever | 26 MiB/s | 23 MiB/s | 7.8 MiB/s | ~19 MiB/s | Full DOM tree; faster on tiny docs |
| tl | 190 MiB/s | 129 MiB/s | 37 MiB/s | ~119 MiB/s | ⚠️ Tag index only, not a DOM tree |

¹ microdom does not fully follow the HTML standard — on severely malformed markup, content may leak into attribute values. However, it never panics or fails: parsing is always attempted. Sufficient for most data extraction use cases.

Code examples: [Full examples in examples/ directory]

use microdom::{parse_html, SelectResult, CustomEvaluator, XPathValue};

/// Demonstrates custom XPath functions: registers a "library" function, layers
/// a user-defined function on top of it, then runs a node query and a function
/// call through the combined evaluator.
fn main() {
    let html = r#"
    <html>
        <body>
            <div id="answer-1" class="answer">Answer 1</div>
            <div id="answer-2" class="answer">Answer 2</div>
            <div id="answer-3" class="answer new-answer">Answer 3</div>
        </body>
    </html>
    "#;

    // Functions that a reusable library might provide.
    let mut lib_evaluator = CustomEvaluator::new();
    // `_args`: this function ignores its arguments; the underscore prefix
    // silences the unused-variable warning.
    lib_evaluator.register_native("lib:hello", |_args| {
        Ok(XPathValue::String("Hello, lib!".into()))
    });

    // Bring the library's functions into the evaluator used for the queries
    // (`lib:hello()` is called through `custom_evaluator` further down).
    let mut custom_evaluator = CustomEvaluator::new();
    custom_evaluator.update(&lib_evaluator);

    custom_evaluator.register_native("userpath:hello", |args| {
        // `get(0)` rather than `args[0]`: calling `userpath:hello()` without
        // arguments falls back to an empty string instead of panicking.
        let first = args.get(0).map(|v| v.to_str()).unwrap_or_default();
        Ok(XPathValue::String(format!("Hello, World! {:?}", first))) // Hello, World! "<some_argument>"
    });

    // `html` is already a `&str`, so it is passed as-is.
    let dom = parse_html(html);

    let xpath = r#"//*[@id="answer-*" and not(contains(@class, "new"))]"#;
    // When no custom functions are needed, `dom.select(xpath)` can be used instead.
    let result = custom_evaluator.select(&dom, xpath).unwrap();
    // Expected matches:
    // <div id="answer-1" class="answer">Answer 1</div>
    // <div id="answer-2" class="answer">Answer 2</div>

    // A query result is a `SelectResult` (Text, Int or List).
    match result {
        SelectResult::List(nodes) => {
            for node in nodes {
                println!("\n\n\nNode: {}", node);
            }
        }
        SelectResult::Text(s) => {
            println!("XPath '{}' returned string: {}", xpath, s);
        }
        _ => {
            println!("XPath did not return a node set.");
        }
    }

    // Custom function usage
    let xpath = r#"lib:hello()"#;
    let result = custom_evaluator.select(&dom, xpath).unwrap();
    println!("XPath '{}' returned: {}", xpath, result);
}

Supported XPath

  • ✅ Node selection (//, /, [@attr])
  • ✅ Wildcards (*) in node names and attributes ([@id="answer-*"])
  • ✅ Predicates ([position()], [@id="x"])
  • ✅ Union (|)
  • ✅ count(), string(), text()
  • ✅ contains(), starts-with(), normalize-space()
  • ✅ substring-before/after()
  • ✅ if/then/else
  • ✅ every $score in //li[@class='comment']/@data satisfies $score >= 0
  • ✅ for $i in //li return $i/a/@href
  • ✅ let $f := function($a) { $a > 5 } return filter(//li/@data, $f)
  • ✅ Logical operators (and, or, not, =, !=, <, >, <=, >=)
  • ✅ User-defined functions (e.g. me_feature:hello(), me_feature:save("data"), etc.)
  • ✅ Mathematical functions (sum, avg, max, min, median, round, floor, ceiling, abs, math:pi(), math:sqrt(), math:pow(), math:cos()....)
  • ⚠️ Under development: regex matching (matches(), replace())
  • ⚠️ Typed return data via SelectResult (Text, Int, List)

Examples

basic data:

use microdom::{parse_html, SelectResult, CustomEvaluator, XPathValue};

// Sample document shared by the examples below: four products, each carrying
// `data-price`, `data-stock` and `data-rating` attributes, a title in <h2>,
// and one or more <span class="tag"> labels.
let html = r#"
<html>
<body>
  <div class="products">
    <div class="product" data-price="120" data-stock="5" data-rating="4.8">
      <h2>Mechanical Keyboard</h2>
      <span class="tag">electronics</span>
      <span class="tag">office</span>
    </div>
    <div class="product" data-price="35" data-stock="0" data-rating="3.2">
      <h2>USB Hub</h2>
      <span class="tag">electronics</span>
    </div>
    <div class="product" data-price="899" data-stock="2" data-rating="4.9">
      <h2>Monitor 4K</h2>
      <span class="tag">electronics</span>
      <span class="tag">display</span>
    </div>
    <div class="product" data-price="19" data-stock="100" data-rating="4.1">
      <h2>Mouse Pad XL</h2>
      <span class="tag">office</span>
    </div>
  </div>
</body>
</html>
"#;

// Parse once; the resulting `dom` is queried by every example that follows.
let dom = parse_html(html);

short example:

    // Selects every element whose id matches the wildcard pattern "answer-*"
    // and whose class attribute does not contain "new".
    // NOTE(review): the product markup above has no id attributes, so this
    // query presumably targets the "answer" markup from the earlier example —
    // confirm which document `dom` is meant to be here.
    let xpath = r#"//*[@id="answer-*" and not(contains(@class, "new"))]"#;
    let result = dom.select(xpath).unwrap();
    println!("XPath '{}' returned: {}", xpath, result);

custom function example:

// Evaluator carrying the shop-specific scoring function.
let mut example_evaluator = CustomEvaluator::new();

// shop:score($price, $stock, $rating) → number
// Missing or non-numeric arguments are treated as 0.
example_evaluator.register("shop:score", |args| {
    // Reads the argument at `index` as an f64, defaulting to 0.0.
    let number_at = |index: usize| {
        args.get(index)
            .and_then(|value| value.to_str().parse::<f64>().ok())
            .unwrap_or(0.0)
    };
    let (price, stock, rating) = (number_at(0), number_at(1), number_at(2));

    // Products that are not in stock lose 50 points.
    let stock_adjustment = if stock > 0.0 { 0.0 } else { -50.0 };
    // One point off per 100 of price, capped at 10 points.
    let price_adjustment = -(price / 100.0).min(10.0);

    Ok(XPathValue::Number(rating * 20.0 + stock_adjustment + price_adjustment))
});

// Keep only the products scoring above 60 and take their titles.
let scored_titles = example_evaluator
    .select(
        &dom,
        r#"//div[@class="product"][shop:score(@data-price, @data-stock, @data-rating) > 60]/h2"#,
    )
    .unwrap();

// SelectResult::List(["Mechanical Keyboard", "Monitor 4K", "Mouse Pad XL"])
println!("{scored_titles:?}");
// Fresh evaluator for the string helper below.
let mut example_evaluator = CustomEvaluator::new();

// str:slug($text) → "Mechanical Keyboard" → "mechanical-keyboard"
example_evaluator.register("str:slug", |args| {
    let lowered = args
        .get(0)
        .map(|value| value.to_str())
        .unwrap_or_default()
        .to_lowercase();

    // Every run of non-alphanumeric characters acts as a single separator, so
    // the slug never has leading, trailing or doubled dashes.
    let words: Vec<&str> = lowered
        .split(|c: char| !c.is_alphanumeric())
        .filter(|word| !word.is_empty())
        .collect();

    Ok(XPathValue::String(words.join("-")))
});

// Slugify the title of every product that is in stock.
let slugs = example_evaluator
    .select(
        &dom,
        r#"//div[@class="product"][@data-stock > 0]/str:slug(h2)"#,
    )
    .unwrap();

// SelectResult::List(["mechanical-keyboard", "monitor-4k", "mouse-pad-xl"])
println!("{slugs:?}");

Crates

  • microdom: microdom
  • coredom: coredom
  • microdom-xpath-engine: microdom-xpath-engine

License

Licensed under either of

at your option.

Packages

 
 
 

Contributors

Languages