ruma_html/
html.rs

1use std::{
2    cell::RefCell,
3    collections::BTreeSet,
4    fmt, io,
5    iter::FusedIterator,
6    rc::{Rc, Weak},
7};
8
9use as_variant::as_variant;
10use html5ever::{
11    local_name, namespace_url, ns, parse_fragment,
12    serialize::{serialize, Serialize, SerializeOpts, Serializer, TraversalScope},
13    tendril::{StrTendril, TendrilSink},
14    tree_builder::{NodeOrText, TreeSink},
15    Attribute, LocalName, ParseOpts, QualName,
16};
17use tracing::debug;
18
19#[cfg(feature = "matrix")]
20pub mod matrix;
21
22use crate::SanitizerConfig;
23
24/// An HTML fragment.
25///
26/// To get the serialized HTML, use its `Display` implementation. Due to the fact that the HTML is
27/// parsed, note that malformed HTML and comments will be stripped from the output.
28#[derive(Debug)]
29pub struct Html {
30    document: NodeRef,
31}
32
33impl Html {
34    /// Construct a new `Html` by parsing the given string.
35    ///
36    /// This is infallible, any error encountered while parsing the HTML is logged with
37    /// `tracing::debug!`.
38    pub fn parse(string: &str) -> Self {
39        let sink = Self::default();
40        let mut parser = parse_fragment(
41            sink,
42            ParseOpts::default(),
43            QualName::new(None, ns!(html), local_name!("div")),
44            Vec::new(),
45        );
46        parser.process(string.into());
47        parser.finish()
48    }
49
50    /// Sanitize this HTML according to the Matrix specification.
51    ///
52    /// This is equivalent to calling [`Self::sanitize_with()`] with a `config` value of
53    /// `SanitizerConfig::compat().remove_reply_fallback()`.
54    pub fn sanitize(&self) {
55        let config = SanitizerConfig::compat().remove_reply_fallback();
56        self.sanitize_with(&config);
57    }
58
59    /// Sanitize this HTML according to the given configuration.
60    pub fn sanitize_with(&self, config: &SanitizerConfig) {
61        config.clean(self);
62    }
63
64    /// Get the root node of the HTML.
65    fn root(&self) -> NodeRef {
66        self.document.first_child().expect("html should always have a root node")
67    }
68
69    /// Whether the root node of the HTML has children.
70    pub fn has_children(&self) -> bool {
71        self.root().has_children()
72    }
73
74    /// The first child node of the root node of the HTML.
75    ///
76    /// Returns `None` if the root node has no children.
77    pub fn first_child(&self) -> Option<NodeRef> {
78        self.root().first_child()
79    }
80
81    /// The last child node of the root node of the HTML .
82    ///
83    /// Returns `None` if the root node has no children.
84    pub fn last_child(&self) -> Option<NodeRef> {
85        self.root().last_child()
86    }
87
88    /// Iterate through the children of the root node of the HTML.
89    pub fn children(&self) -> Children {
90        Children::new(self.first_child())
91    }
92}
93
94impl Default for Html {
95    fn default() -> Self {
96        Self { document: NodeRef::new(NodeData::Document) }
97    }
98}
99
100impl TreeSink for Html {
101    type Handle = NodeRef;
102    type Output = Self;
103    type ElemName<'a> = html5ever::ExpandedName<'a>;
104
105    fn finish(self) -> Self::Output {
106        self
107    }
108
109    fn parse_error(&self, msg: std::borrow::Cow<'static, str>) {
110        debug!("HTML parse error: {msg}");
111    }
112
113    fn get_document(&self) -> Self::Handle {
114        self.document.clone()
115    }
116
117    fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> html5ever::ExpandedName<'a> {
118        target.as_element().expect("not an element").name.expanded()
119    }
120
121    fn create_element(
122        &self,
123        name: QualName,
124        attrs: Vec<Attribute>,
125        _flags: html5ever::tree_builder::ElementFlags,
126    ) -> Self::Handle {
127        NodeRef::new(NodeData::Element(ElementData {
128            name,
129            attrs: RefCell::new(attrs.into_iter().collect()),
130        }))
131    }
132
133    fn create_comment(&self, _text: StrTendril) -> Self::Handle {
134        NodeRef::new(NodeData::Other)
135    }
136
137    fn create_pi(&self, _target: StrTendril, _data: StrTendril) -> Self::Handle {
138        NodeRef::new(NodeData::Other)
139    }
140
141    fn append(&self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
142        match child {
143            NodeOrText::AppendNode(node) => parent.append_child(node),
144            NodeOrText::AppendText(text) => {
145                // If the previous sibling is also text, add this text to it.
146                if let Some(prev_text) =
147                    parent.last_child().as_ref().and_then(|sibling| sibling.as_text())
148                {
149                    prev_text.borrow_mut().push_tendril(&text);
150                } else {
151                    let node = NodeRef::new(NodeData::Text(text.into()));
152                    parent.append_child(node);
153                }
154            }
155        }
156    }
157
158    fn append_based_on_parent_node(
159        &self,
160        element: &Self::Handle,
161        prev_element: &Self::Handle,
162        child: NodeOrText<Self::Handle>,
163    ) {
164        if element.0.parent.borrow().is_some() {
165            self.append_before_sibling(element, child);
166        } else {
167            self.append(prev_element, child);
168        }
169    }
170
171    fn append_doctype_to_document(
172        &self,
173        _name: StrTendril,
174        _public_id: StrTendril,
175        _system_id: StrTendril,
176    ) {
177    }
178
179    fn get_template_contents(&self, target: &Self::Handle) -> Self::Handle {
180        target.clone()
181    }
182
183    fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
184        Rc::ptr_eq(&x.0, &y.0)
185    }
186
187    fn set_quirks_mode(&self, _mode: html5ever::tree_builder::QuirksMode) {}
188
189    fn append_before_sibling(&self, sibling: &Self::Handle, new_node: NodeOrText<Self::Handle>) {
190        match new_node {
191            NodeOrText::AppendNode(node) => node.insert_before_sibling(sibling),
192            NodeOrText::AppendText(text) => {
193                // If the previous sibling is also text, add this text to it.
194                if let Some(prev_text) =
195                    sibling.prev_sibling().as_ref().and_then(|prev_sibling| prev_sibling.as_text())
196                {
197                    prev_text.borrow_mut().push_tendril(&text);
198                } else {
199                    let node = NodeRef::new(NodeData::Text(text.into()));
200                    node.insert_before_sibling(sibling);
201                }
202            }
203        }
204    }
205
206    fn add_attrs_if_missing(&self, target: &Self::Handle, attrs: Vec<Attribute>) {
207        let element = target.as_element().unwrap();
208        element.attrs.borrow_mut().extend(attrs);
209    }
210
211    fn remove_from_parent(&self, target: &Self::Handle) {
212        target.detach();
213    }
214
215    fn reparent_children(&self, node: &Self::Handle, new_parent: &Self::Handle) {
216        for child in node.0.children.take() {
217            child.0.parent.take();
218            new_parent.append_child(child);
219        }
220    }
221}
222
223impl Serialize for Html {
224    fn serialize<S>(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()>
225    where
226        S: Serializer,
227    {
228        match traversal_scope {
229            TraversalScope::IncludeNode => {
230                for child in self.children() {
231                    child.serialize(serializer)?;
232                }
233
234                Ok(())
235            }
236            TraversalScope::ChildrenOnly(_) => Ok(()),
237        }
238    }
239}
240
241impl fmt::Display for Html {
242    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
243        let mut u8_vec = Vec::new();
244        serialize(
245            &mut u8_vec,
246            self,
247            SerializeOpts { traversal_scope: TraversalScope::IncludeNode, ..Default::default() },
248        )
249        .unwrap();
250
251        f.write_str(&String::from_utf8(u8_vec).unwrap())?;
252
253        Ok(())
254    }
255}
256
257/// An HTML node.
258#[derive(Debug)]
259#[non_exhaustive]
260struct Node {
261    parent: RefCell<Option<Weak<Node>>>,
262    children: RefCell<Vec<NodeRef>>,
263    data: NodeData,
264}
265
266impl Node {
267    /// Constructs a new `NodeRef` with the given data.
268    fn new(data: NodeData) -> Self {
269        Self { parent: Default::default(), children: Default::default(), data }
270    }
271
272    /// Returns the data of this `Node` if it is an Element (aka an HTML tag).
273    fn as_element(&self) -> Option<&ElementData> {
274        as_variant!(&self.data, NodeData::Element)
275    }
276
277    /// Returns the text content of this `Node`, if it is a `NodeData::Text`.
278    fn as_text(&self) -> Option<&RefCell<StrTendril>> {
279        as_variant!(&self.data, NodeData::Text)
280    }
281
282    /// Whether this is the root node of the HTML document.
283    fn is_root(&self) -> bool {
284        // The root node is the `html` element.
285        matches!(&self.data, NodeData::Element(element_data) if element_data.name.local.as_bytes() == b"html")
286    }
287
288    /// The parent of this node, if any.
289    fn parent(&self) -> Option<NodeRef> {
290        self.parent.borrow().as_ref()?.upgrade().map(NodeRef)
291    }
292}
293
/// The data of a `Node`.
///
/// This tells what kind of node it is and carries the content specific to that kind.
#[derive(Debug, Clone)]
#[allow(clippy::exhaustive_enums)]
pub enum NodeData {
    /// The root node of the `Html`.
    ///
    /// This is the node that the parser fills; it is created by `Html::default()`.
    Document,

    /// A text node.
    ///
    /// The text sits in a `RefCell` so that more text can be pushed onto an existing node.
    Text(RefCell<StrTendril>),

    /// An HTML element (aka a tag).
    Element(ElementData),

    /// Other types (comment, processing instruction, …).
    ///
    /// Their content is not kept and they are skipped during serialization.
    Other,
}
310
311/// The data of an HTML element.
312#[derive(Debug, Clone)]
313#[allow(clippy::exhaustive_structs)]
314pub struct ElementData {
315    /// The qualified name of the element.
316    pub name: QualName,
317
318    /// The attributes of the element.
319    pub attrs: RefCell<BTreeSet<Attribute>>,
320}
321
322impl ElementData {
323    /// Convert this element data to typed data as [suggested by the Matrix Specification][spec].
324    ///
325    /// [spec]: https://spec.matrix.org/latest/client-server-api/#mroommessage-msgtypes
326    #[cfg(feature = "matrix")]
327    pub fn to_matrix(&self) -> matrix::MatrixElementData {
328        matrix::MatrixElementData::parse(&self.name, &self.attrs.borrow())
329    }
330}
331
332/// A reference to an HTML node.
333#[derive(Debug, Clone)]
334#[non_exhaustive]
335pub struct NodeRef(Rc<Node>);
336
337impl NodeRef {
338    /// Constructs a new `NodeRef` with the given data.
339    fn new(data: NodeData) -> Self {
340        Self(Node::new(data).into())
341    }
342
343    /// Detach this node from the tree, if it has a parent.
344    pub(crate) fn detach(&self) {
345        if let Some((parent, index)) = self.parent_and_index() {
346            parent.0.children.borrow_mut().remove(index);
347            self.0.parent.take();
348        }
349    }
350
351    /// Append the given child node to this node.
352    ///
353    /// The child node is detached from its previous position.
354    fn append_child(&self, child: NodeRef) {
355        child.detach();
356
357        child.0.parent.replace(Some(Rc::downgrade(&self.0)));
358        self.0.children.borrow_mut().push(child);
359    }
360
361    /// If this node has a parent, get it and the node's position in the parent's children.
362    fn parent_and_index(&self) -> Option<(NodeRef, usize)> {
363        let parent = self.0.parent()?;
364        let i = parent
365            .0
366            .children
367            .borrow()
368            .iter()
369            .position(|child| Rc::ptr_eq(&child.0, &self.0))
370            .expect("child should be in parent's children");
371        Some((parent, i))
372    }
373
374    /// Insert this node before the given sibling.
375    ///
376    /// This node is detached from its previous position.
377    pub(crate) fn insert_before_sibling(&self, sibling: &NodeRef) {
378        self.detach();
379
380        let (parent, index) = sibling.parent_and_index().expect("sibling should have parent");
381
382        self.0.parent.replace(Some(Rc::downgrade(&parent.0)));
383        parent.0.children.borrow_mut().insert(index, self.clone());
384    }
385
386    /// Constructs a new element `NodeRef` with the same data as this one, but with a different
387    /// element name and use it to replace this one in the parent.
388    ///
389    /// Panics if this node is not in the tree and is not an element node.
390    pub(crate) fn replace_with_element_name(self, name: LocalName) -> NodeRef {
391        let mut element_data = self.as_element().unwrap().clone();
392        element_data.name.local = name;
393
394        let new_node = NodeRef::new(NodeData::Element(element_data));
395
396        for child in self.children() {
397            new_node.append_child(child);
398        }
399
400        new_node.insert_before_sibling(&self);
401        self.detach();
402
403        new_node
404    }
405
406    /// The data of the node.
407    pub fn data(&self) -> &NodeData {
408        &self.0.data
409    }
410
411    /// Returns the data of this `Node` if it is an Element (aka an HTML tag).
412    pub fn as_element(&self) -> Option<&ElementData> {
413        self.0.as_element()
414    }
415
416    /// Returns the text content of this `Node`, if it is a `NodeData::Text`.
417    pub fn as_text(&self) -> Option<&RefCell<StrTendril>> {
418        self.0.as_text()
419    }
420
421    /// The parent node of this node.
422    ///
423    /// Returns `None` if the parent is the root node.
424    pub fn parent(&self) -> Option<NodeRef> {
425        let parent = self.0.parent()?;
426
427        // We don't want users to be able to navigate to the root.
428        if parent.0.is_root() {
429            return None;
430        }
431
432        Some(parent)
433    }
434
435    /// The next sibling node of this node.
436    ///
437    /// Returns `None` if this is the last of its siblings.
438    pub fn next_sibling(&self) -> Option<NodeRef> {
439        let (parent, index) = self.parent_and_index()?;
440        let index = index.checked_add(1)?;
441        let sibling = parent.0.children.borrow().get(index).cloned();
442        sibling
443    }
444
445    /// The previous sibling node of this node.
446    ///
447    /// Returns `None` if this is the first of its siblings.
448    pub fn prev_sibling(&self) -> Option<NodeRef> {
449        let (parent, index) = self.parent_and_index()?;
450        let index = index.checked_sub(1)?;
451        let sibling = parent.0.children.borrow().get(index).cloned();
452        sibling
453    }
454
455    /// Whether this node has children.
456    pub fn has_children(&self) -> bool {
457        !self.0.children.borrow().is_empty()
458    }
459
460    /// The first child node of this node.
461    ///
462    /// Returns `None` if this node has no children.
463    pub fn first_child(&self) -> Option<NodeRef> {
464        self.0.children.borrow().first().cloned()
465    }
466
467    /// The last child node of this node.
468    ///
469    /// Returns `None` if this node has no children.
470    pub fn last_child(&self) -> Option<NodeRef> {
471        self.0.children.borrow().last().cloned()
472    }
473
474    /// Get an iterator through the children of this node.
475    pub fn children(&self) -> Children {
476        Children::new(self.first_child())
477    }
478
479    pub(crate) fn serialize<S>(&self, serializer: &mut S) -> io::Result<()>
480    where
481        S: Serializer,
482    {
483        match self.data() {
484            NodeData::Element(data) => {
485                serializer.start_elem(
486                    data.name.clone(),
487                    data.attrs.borrow().iter().map(|attr| (&attr.name, &*attr.value)),
488                )?;
489
490                for child in self.children() {
491                    child.serialize(serializer)?;
492                }
493
494                serializer.end_elem(data.name.clone())?;
495
496                Ok(())
497            }
498            NodeData::Document => {
499                for child in self.children() {
500                    child.serialize(serializer)?;
501                }
502
503                Ok(())
504            }
505            NodeData::Text(text) => serializer.write_text(&text.borrow()),
506            _ => Ok(()),
507        }
508    }
509}
510
511/// An iterator through the children of a node.
512///
513/// Can be constructed with [`Html::children()`] or [`NodeRef::children()`].
514#[derive(Debug, Clone)]
515pub struct Children {
516    next: Option<NodeRef>,
517}
518
519impl Children {
520    /// Construct a `Children` starting from the given node.
521    fn new(start_node: Option<NodeRef>) -> Self {
522        Self { next: start_node }
523    }
524}
525
526impl Iterator for Children {
527    type Item = NodeRef;
528
529    fn next(&mut self) -> Option<Self::Item> {
530        let next = self.next.take()?;
531        self.next = next.next_sibling();
532        Some(next)
533    }
534}
535
536impl FusedIterator for Children {}
537
#[cfg(test)]
mod tests {
    use super::Html;

    /// Well-formed HTML and the empty string both come out of a parse and serialize round trip
    /// unchanged.
    #[test]
    fn sanity() {
        let html = "\
            <h1>Title</h1>\
            <div>\
                <p>This is some <em>text</em></p>\
            </div>\
        ";
        let parsed = Html::parse(html);
        assert_eq!(parsed.to_string(), html);

        let empty = Html::parse("");
        assert_eq!(empty.to_string(), "");
    }
}