ruma_html/
html.rs

1use std::{
2    cell::RefCell,
3    collections::BTreeSet,
4    fmt, io,
5    iter::FusedIterator,
6    rc::{Rc, Weak},
7};
8
9use as_variant::as_variant;
10use html5ever::{
11    local_name, ns, parse_fragment,
12    serialize::{serialize, Serialize, SerializeOpts, Serializer, TraversalScope},
13    tendril::{StrTendril, TendrilSink},
14    tree_builder::{NodeOrText, TreeSink},
15    Attribute, LocalName, ParseOpts, QualName,
16};
17use tracing::debug;
18
19#[cfg(feature = "matrix")]
20pub mod matrix;
21
22use crate::SanitizerConfig;
23
24/// An HTML fragment.
25///
26/// To get the serialized HTML, use its `Display` implementation. Due to the fact that the HTML is
27/// parsed, note that malformed HTML and comments will be stripped from the output.
28#[derive(Debug)]
29pub struct Html {
30    document: NodeRef,
31}
32
33impl Html {
34    /// Construct a new `Html` by parsing the given string.
35    ///
36    /// This is infallible, any error encountered while parsing the HTML is logged with
37    /// `tracing::debug!`.
38    pub fn parse(string: &str) -> Self {
39        let sink = Self::default();
40        let mut parser = parse_fragment(
41            sink,
42            ParseOpts::default(),
43            QualName::new(None, ns!(html), local_name!("div")),
44            Vec::new(),
45            true,
46        );
47        parser.process(string.into());
48        parser.finish()
49    }
50
51    /// Sanitize this HTML according to the Matrix specification.
52    ///
53    /// This is equivalent to calling [`Self::sanitize_with()`] with a `config` value of
54    /// `SanitizerConfig::compat().remove_reply_fallback()`.
55    pub fn sanitize(&self) {
56        let config = SanitizerConfig::compat().remove_reply_fallback();
57        self.sanitize_with(&config);
58    }
59
60    /// Sanitize this HTML according to the given configuration.
61    pub fn sanitize_with(&self, config: &SanitizerConfig) {
62        config.clean(self);
63    }
64
65    /// Get the root node of the HTML.
66    fn root(&self) -> NodeRef {
67        self.document.first_child().expect("html should always have a root node")
68    }
69
70    /// Whether the root node of the HTML has children.
71    pub fn has_children(&self) -> bool {
72        self.root().has_children()
73    }
74
75    /// The first child node of the root node of the HTML.
76    ///
77    /// Returns `None` if the root node has no children.
78    pub fn first_child(&self) -> Option<NodeRef> {
79        self.root().first_child()
80    }
81
82    /// The last child node of the root node of the HTML .
83    ///
84    /// Returns `None` if the root node has no children.
85    pub fn last_child(&self) -> Option<NodeRef> {
86        self.root().last_child()
87    }
88
89    /// Iterate through the children of the root node of the HTML.
90    pub fn children(&self) -> Children {
91        Children::new(self.first_child())
92    }
93}
94
95impl Default for Html {
96    fn default() -> Self {
97        Self { document: NodeRef::new(NodeData::Document) }
98    }
99}
100
101impl TreeSink for Html {
102    type Handle = NodeRef;
103    type Output = Self;
104    type ElemName<'a> = html5ever::ExpandedName<'a>;
105
106    fn finish(self) -> Self::Output {
107        self
108    }
109
110    fn parse_error(&self, msg: std::borrow::Cow<'static, str>) {
111        debug!("HTML parse error: {msg}");
112    }
113
114    fn get_document(&self) -> Self::Handle {
115        self.document.clone()
116    }
117
118    fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> html5ever::ExpandedName<'a> {
119        target.as_element().expect("not an element").name.expanded()
120    }
121
122    fn create_element(
123        &self,
124        name: QualName,
125        attrs: Vec<Attribute>,
126        _flags: html5ever::tree_builder::ElementFlags,
127    ) -> Self::Handle {
128        NodeRef::new(NodeData::Element(ElementData {
129            name,
130            attrs: RefCell::new(attrs.into_iter().collect()),
131        }))
132    }
133
134    fn create_comment(&self, _text: StrTendril) -> Self::Handle {
135        NodeRef::new(NodeData::Other)
136    }
137
138    fn create_pi(&self, _target: StrTendril, _data: StrTendril) -> Self::Handle {
139        NodeRef::new(NodeData::Other)
140    }
141
142    fn append(&self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
143        match child {
144            NodeOrText::AppendNode(node) => parent.append_child(node),
145            NodeOrText::AppendText(text) => {
146                // If the previous sibling is also text, add this text to it.
147                if let Some(prev_text) =
148                    parent.last_child().as_ref().and_then(|sibling| sibling.as_text())
149                {
150                    prev_text.borrow_mut().push_tendril(&text);
151                } else {
152                    let node = NodeRef::new(NodeData::Text(text.into()));
153                    parent.append_child(node);
154                }
155            }
156        }
157    }
158
159    fn append_based_on_parent_node(
160        &self,
161        element: &Self::Handle,
162        prev_element: &Self::Handle,
163        child: NodeOrText<Self::Handle>,
164    ) {
165        if element.0.parent.borrow().is_some() {
166            self.append_before_sibling(element, child);
167        } else {
168            self.append(prev_element, child);
169        }
170    }
171
172    fn append_doctype_to_document(
173        &self,
174        _name: StrTendril,
175        _public_id: StrTendril,
176        _system_id: StrTendril,
177    ) {
178    }
179
180    fn get_template_contents(&self, target: &Self::Handle) -> Self::Handle {
181        target.clone()
182    }
183
184    fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
185        Rc::ptr_eq(&x.0, &y.0)
186    }
187
188    fn set_quirks_mode(&self, _mode: html5ever::tree_builder::QuirksMode) {}
189
190    fn append_before_sibling(&self, sibling: &Self::Handle, new_node: NodeOrText<Self::Handle>) {
191        match new_node {
192            NodeOrText::AppendNode(node) => node.insert_before_sibling(sibling),
193            NodeOrText::AppendText(text) => {
194                // If the previous sibling is also text, add this text to it.
195                if let Some(prev_text) =
196                    sibling.prev_sibling().as_ref().and_then(|prev_sibling| prev_sibling.as_text())
197                {
198                    prev_text.borrow_mut().push_tendril(&text);
199                } else {
200                    let node = NodeRef::new(NodeData::Text(text.into()));
201                    node.insert_before_sibling(sibling);
202                }
203            }
204        }
205    }
206
207    fn add_attrs_if_missing(&self, target: &Self::Handle, attrs: Vec<Attribute>) {
208        let element = target.as_element().unwrap();
209        element.attrs.borrow_mut().extend(attrs);
210    }
211
212    fn remove_from_parent(&self, target: &Self::Handle) {
213        target.detach();
214    }
215
216    fn reparent_children(&self, node: &Self::Handle, new_parent: &Self::Handle) {
217        for child in node.0.children.take() {
218            child.0.parent.take();
219            new_parent.append_child(child);
220        }
221    }
222}
223
224impl Serialize for Html {
225    fn serialize<S>(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()>
226    where
227        S: Serializer,
228    {
229        match traversal_scope {
230            TraversalScope::IncludeNode => {
231                for child in self.children() {
232                    child.serialize(serializer)?;
233                }
234
235                Ok(())
236            }
237            TraversalScope::ChildrenOnly(_) => Ok(()),
238        }
239    }
240}
241
242impl fmt::Display for Html {
243    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
244        let mut u8_vec = Vec::new();
245        serialize(
246            &mut u8_vec,
247            self,
248            SerializeOpts { traversal_scope: TraversalScope::IncludeNode, ..Default::default() },
249        )
250        .unwrap();
251
252        f.write_str(&String::from_utf8(u8_vec).unwrap())?;
253
254        Ok(())
255    }
256}
257
258/// An HTML node.
259#[derive(Debug)]
260#[non_exhaustive]
261struct Node {
262    parent: RefCell<Option<Weak<Node>>>,
263    children: RefCell<Vec<NodeRef>>,
264    data: NodeData,
265}
266
267impl Node {
268    /// Constructs a new `NodeRef` with the given data.
269    fn new(data: NodeData) -> Self {
270        Self { parent: Default::default(), children: Default::default(), data }
271    }
272
273    /// Returns the data of this `Node` if it is an Element (aka an HTML tag).
274    fn as_element(&self) -> Option<&ElementData> {
275        as_variant!(&self.data, NodeData::Element)
276    }
277
278    /// Returns the text content of this `Node`, if it is a `NodeData::Text`.
279    fn as_text(&self) -> Option<&RefCell<StrTendril>> {
280        as_variant!(&self.data, NodeData::Text)
281    }
282
283    /// Whether this is the root node of the HTML document.
284    fn is_root(&self) -> bool {
285        // The root node is the `html` element.
286        matches!(&self.data, NodeData::Element(element_data) if element_data.name.local.as_bytes() == b"html")
287    }
288
289    /// The parent of this node, if any.
290    fn parent(&self) -> Option<NodeRef> {
291        self.parent.borrow().as_ref()?.upgrade().map(NodeRef)
292    }
293}
294
/// The data of a `Node`.
///
/// This tells which kind of node it is and holds the content specific to that kind.
#[derive(Debug, Clone)]
// NOTE(review): presumably exhaustive on purpose so users can match on every variant; confirm.
#[allow(clippy::exhaustive_enums)]
pub enum NodeData {
    /// The root node of the `Html`.
    ///
    /// This is the data of the node created by `Html::default()`.
    Document,

    /// A text node.
    ///
    /// The text is behind a `RefCell` so it can be extended through a shared reference, which the
    /// tree builder does to merge adjacent pieces of text.
    Text(RefCell<StrTendril>),

    /// An HTML element (aka a tag).
    Element(ElementData),

    /// Other types (comment, processing instruction, …).
    ///
    /// Their content is not kept, and they are skipped during serialization.
    Other,
}
311
312/// The data of an HTML element.
313#[derive(Debug, Clone)]
314#[allow(clippy::exhaustive_structs)]
315pub struct ElementData {
316    /// The qualified name of the element.
317    pub name: QualName,
318
319    /// The attributes of the element.
320    pub attrs: RefCell<BTreeSet<Attribute>>,
321}
322
323impl ElementData {
324    /// Convert this element data to typed data as [suggested by the Matrix Specification][spec].
325    ///
326    /// [spec]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
327    #[cfg(feature = "matrix")]
328    pub fn to_matrix(&self) -> matrix::MatrixElementData {
329        matrix::MatrixElementData::parse(&self.name, &self.attrs.borrow())
330    }
331}
332
333/// A reference to an HTML node.
334#[derive(Debug, Clone)]
335#[non_exhaustive]
336pub struct NodeRef(Rc<Node>);
337
338impl NodeRef {
339    /// Constructs a new `NodeRef` with the given data.
340    fn new(data: NodeData) -> Self {
341        Self(Node::new(data).into())
342    }
343
344    /// Detach this node from the tree, if it has a parent.
345    pub(crate) fn detach(&self) {
346        if let Some((parent, index)) = self.parent_and_index() {
347            parent.0.children.borrow_mut().remove(index);
348            self.0.parent.take();
349        }
350    }
351
352    /// Append the given child node to this node.
353    ///
354    /// The child node is detached from its previous position.
355    fn append_child(&self, child: NodeRef) {
356        child.detach();
357
358        child.0.parent.replace(Some(Rc::downgrade(&self.0)));
359        self.0.children.borrow_mut().push(child);
360    }
361
362    /// If this node has a parent, get it and the node's position in the parent's children.
363    fn parent_and_index(&self) -> Option<(NodeRef, usize)> {
364        let parent = self.0.parent()?;
365        let i = parent
366            .0
367            .children
368            .borrow()
369            .iter()
370            .position(|child| Rc::ptr_eq(&child.0, &self.0))
371            .expect("child should be in parent's children");
372        Some((parent, i))
373    }
374
375    /// Insert this node before the given sibling.
376    ///
377    /// This node is detached from its previous position.
378    pub(crate) fn insert_before_sibling(&self, sibling: &NodeRef) {
379        self.detach();
380
381        let (parent, index) = sibling.parent_and_index().expect("sibling should have parent");
382
383        self.0.parent.replace(Some(Rc::downgrade(&parent.0)));
384        parent.0.children.borrow_mut().insert(index, self.clone());
385    }
386
387    /// Constructs a new element `NodeRef` with the same data as this one, but with a different
388    /// element name and use it to replace this one in the parent.
389    ///
390    /// Panics if this node is not in the tree and is not an element node.
391    pub(crate) fn replace_with_element_name(self, name: LocalName) -> NodeRef {
392        let mut element_data = self.as_element().unwrap().clone();
393        element_data.name.local = name;
394
395        let new_node = NodeRef::new(NodeData::Element(element_data));
396
397        for child in self.children() {
398            new_node.append_child(child);
399        }
400
401        new_node.insert_before_sibling(&self);
402        self.detach();
403
404        new_node
405    }
406
407    /// The data of the node.
408    pub fn data(&self) -> &NodeData {
409        &self.0.data
410    }
411
412    /// Returns the data of this `Node` if it is an Element (aka an HTML tag).
413    pub fn as_element(&self) -> Option<&ElementData> {
414        self.0.as_element()
415    }
416
417    /// Returns the text content of this `Node`, if it is a `NodeData::Text`.
418    pub fn as_text(&self) -> Option<&RefCell<StrTendril>> {
419        self.0.as_text()
420    }
421
422    /// The parent node of this node.
423    ///
424    /// Returns `None` if the parent is the root node.
425    pub fn parent(&self) -> Option<NodeRef> {
426        let parent = self.0.parent()?;
427
428        // We don't want users to be able to navigate to the root.
429        if parent.0.is_root() {
430            return None;
431        }
432
433        Some(parent)
434    }
435
436    /// The next sibling node of this node.
437    ///
438    /// Returns `None` if this is the last of its siblings.
439    pub fn next_sibling(&self) -> Option<NodeRef> {
440        let (parent, index) = self.parent_and_index()?;
441        let index = index.checked_add(1)?;
442        let sibling = parent.0.children.borrow().get(index).cloned();
443        sibling
444    }
445
446    /// The previous sibling node of this node.
447    ///
448    /// Returns `None` if this is the first of its siblings.
449    pub fn prev_sibling(&self) -> Option<NodeRef> {
450        let (parent, index) = self.parent_and_index()?;
451        let index = index.checked_sub(1)?;
452        let sibling = parent.0.children.borrow().get(index).cloned();
453        sibling
454    }
455
456    /// Whether this node has children.
457    pub fn has_children(&self) -> bool {
458        !self.0.children.borrow().is_empty()
459    }
460
461    /// The first child node of this node.
462    ///
463    /// Returns `None` if this node has no children.
464    pub fn first_child(&self) -> Option<NodeRef> {
465        self.0.children.borrow().first().cloned()
466    }
467
468    /// The last child node of this node.
469    ///
470    /// Returns `None` if this node has no children.
471    pub fn last_child(&self) -> Option<NodeRef> {
472        self.0.children.borrow().last().cloned()
473    }
474
475    /// Get an iterator through the children of this node.
476    pub fn children(&self) -> Children {
477        Children::new(self.first_child())
478    }
479
480    pub(crate) fn serialize<S>(&self, serializer: &mut S) -> io::Result<()>
481    where
482        S: Serializer,
483    {
484        match self.data() {
485            NodeData::Element(data) => {
486                serializer.start_elem(
487                    data.name.clone(),
488                    data.attrs.borrow().iter().map(|attr| (&attr.name, &*attr.value)),
489                )?;
490
491                for child in self.children() {
492                    child.serialize(serializer)?;
493                }
494
495                serializer.end_elem(data.name.clone())?;
496
497                Ok(())
498            }
499            NodeData::Document => {
500                for child in self.children() {
501                    child.serialize(serializer)?;
502                }
503
504                Ok(())
505            }
506            NodeData::Text(text) => serializer.write_text(&text.borrow()),
507            _ => Ok(()),
508        }
509    }
510}
511
512/// An iterator through the children of a node.
513///
514/// Can be constructed with [`Html::children()`] or [`NodeRef::children()`].
515#[derive(Debug, Clone)]
516pub struct Children {
517    next: Option<NodeRef>,
518}
519
520impl Children {
521    /// Construct a `Children` starting from the given node.
522    fn new(start_node: Option<NodeRef>) -> Self {
523        Self { next: start_node }
524    }
525}
526
527impl Iterator for Children {
528    type Item = NodeRef;
529
530    fn next(&mut self) -> Option<Self::Item> {
531        let next = self.next.take()?;
532        self.next = next.next_sibling();
533        Some(next)
534    }
535}
536
537impl FusedIterator for Children {}
538
#[cfg(test)]
mod tests {
    use super::Html;

    /// Well-formed HTML and the empty string must both survive a parse/serialize round trip.
    #[test]
    fn sanity() {
        let html = concat!(
            "<h1>Title</h1>",
            "<div>",
            "<p>This is some <em>text</em></p>",
            "</div>",
        );
        let rendered = Html::parse(html).to_string();
        assert_eq!(rendered, html);

        let rendered_empty = Html::parse("").to_string();
        assert_eq!(rendered_empty, "");
    }
}
555}