ruma_html/sanitizer_config/
clean.rs

1use html5ever::{tendril::StrTendril, Attribute, LocalName};
2use phf::{phf_map, phf_set, Map, Set};
3use wildmatch::WildMatch;
4
5use crate::{ElementData, Html, HtmlSanitizerMode, NodeData, NodeRef, SanitizerConfig};
6
7/// HTML elements allowed in the Matrix specification.
8static ALLOWED_ELEMENTS_STRICT: Set<&str> = phf_set! {
9    "del", "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "p", "a",
10    "ul", "ol", "sup", "sub", "li", "b", "i", "u", "strong", "em", "s",
11    "code", "hr", "br", "div", "table", "thead", "tbody", "tr", "th", "td",
12    "caption", "pre", "span", "img", "details", "summary", "mx-reply",
13};
14
15/// The HTML element name for a rich reply fallback.
16const RICH_REPLY_ELEMENT_NAME: &str = "mx-reply";
17
18/// HTML elements that were previously allowed in the Matrix specification, with their replacement.
19static DEPRECATED_ELEMENTS: Map<&str, &str> = phf_map! {
20    "font" => "span",
21    "strike" => "s",
22};
23
24/// Allowed attributes per HTML element according to the Matrix specification.
25static ALLOWED_ATTRIBUTES_STRICT: Map<&str, &Set<&str>> = phf_map! {
26    "span" => &ALLOWED_ATTRIBUTES_SPAN_STRICT,
27    "a" => &ALLOWED_ATTRIBUTES_A_STRICT,
28    "img" => &ALLOWED_ATTRIBUTES_IMG_STRICT,
29    "ol" => &ALLOWED_ATTRIBUTES_OL_STRICT,
30    "code" => &ALLOWED_ATTRIBUTES_CODE_STRICT,
31    "div" => &ALLOWED_ATTRIBUTES_DIV_STRICT,
32};
33static ALLOWED_ATTRIBUTES_SPAN_STRICT: Set<&str> =
34    phf_set! { "data-mx-bg-color", "data-mx-color", "data-mx-spoiler", "data-mx-maths" };
35static ALLOWED_ATTRIBUTES_A_STRICT: Set<&str> = phf_set! { "target", "href" };
36static ALLOWED_ATTRIBUTES_IMG_STRICT: Set<&str> =
37    phf_set! { "width", "height", "alt", "title", "src" };
38static ALLOWED_ATTRIBUTES_OL_STRICT: Set<&str> = phf_set! { "start" };
39static ALLOWED_ATTRIBUTES_CODE_STRICT: Set<&str> = phf_set! { "class" };
40static ALLOWED_ATTRIBUTES_DIV_STRICT: Set<&str> = phf_set! { "data-mx-maths" };
41
42/// Attributes that were previously allowed on HTML elements according to the Matrix specification,
43/// with their replacement.
44static DEPRECATED_ATTRS: Map<&str, &Map<&str, &str>> = phf_map! {
45    "font" => &DEPRECATED_ATTRIBUTES_FONT,
46};
47static DEPRECATED_ATTRIBUTES_FONT: Map<&str, &str> = phf_map! { "color" => "data-mx-color" };
48
49/// Allowed schemes of URIs per attribute per HTML element according to the Matrix specification.
50static ALLOWED_SCHEMES_STRICT: Map<&str, &Map<&str, &Set<&str>>> = phf_map! {
51    "a" => &ALLOWED_SCHEMES_A_STRICT,
52    "img" => &ALLOWED_SCHEMES_IMG_STRICT,
53};
54static ALLOWED_SCHEMES_A_STRICT: Map<&str, &Set<&str>> = phf_map! {
55    "href" => &ALLOWED_SCHEMES_A_HREF_STRICT,
56};
57pub(crate) static ALLOWED_SCHEMES_A_HREF_STRICT: Set<&str> =
58    phf_set! { "http", "https", "ftp", "mailto", "magnet" };
59static ALLOWED_SCHEMES_IMG_STRICT: Map<&str, &Set<&str>> = phf_map! {
60    "src" => &ALLOWED_SCHEMES_IMG_SRC_STRICT,
61};
62static ALLOWED_SCHEMES_IMG_SRC_STRICT: Set<&str> = phf_set! { "mxc" };
63
64/// Extra allowed schemes of URIs per attribute per HTML element.
65///
66/// This is a convenience list to add schemes that can be encountered but are not listed in the
67/// Matrix specification. It consists of:
68///
69/// * The `matrix` scheme for `a` elements (see [matrix-org/matrix-spec#1108]).
70///
71/// To get a complete list, add these to `ALLOWED_SCHEMES_STRICT`.
72///
73/// [matrix-org/matrix-spec#1108]: https://github.com/matrix-org/matrix-spec/issues/1108
74static ALLOWED_SCHEMES_COMPAT: Map<&str, &Map<&str, &Set<&str>>> = phf_map! {
75    "a" => &ALLOWED_SCHEMES_A_COMPAT,
76};
77static ALLOWED_SCHEMES_A_COMPAT: Map<&str, &Set<&str>> = phf_map! {
78    "href" => &ALLOWED_SCHEMES_A_HREF_COMPAT,
79};
80pub(crate) static ALLOWED_SCHEMES_A_HREF_COMPAT: Set<&str> = phf_set! { "matrix" };
81
82/// Allowed classes per HTML element according to the Matrix specification.
83static ALLOWED_CLASSES_STRICT: Map<&str, &Set<&str>> =
84    phf_map! { "code" => &ALLOWED_CLASSES_CODE_STRICT };
85static ALLOWED_CLASSES_CODE_STRICT: Set<&str> = phf_set! { "language-*" };
86
87/// Max depth of nested HTML elements allowed by the Matrix specification.
88const MAX_DEPTH_STRICT: u32 = 100;
89
90impl SanitizerConfig {
91    /// Whether the current mode uses the values of the strict mode.
92    fn use_strict(&self) -> bool {
93        self.mode.is_some()
94    }
95
96    /// Whether the current mode uses the values of the compat mode.
97    fn use_compat(&self) -> bool {
98        self.mode.is_some_and(|m| m == HtmlSanitizerMode::Compat)
99    }
100
101    /// The maximum nesting level allowed by the config.
102    fn max_depth_value(&self) -> Option<u32> {
103        self.max_depth.or_else(|| self.use_strict().then_some(MAX_DEPTH_STRICT))
104    }
105
106    /// Clean the given HTML with this sanitizer.
107    pub(crate) fn clean(&self, html: &Html) {
108        for child in html.children() {
109            self.clean_node(child, 0);
110        }
111    }
112
113    fn clean_node(&self, node: NodeRef, depth: u32) {
114        let node = self.apply_replacements(node);
115
116        let action = self.node_action(&node, depth);
117
118        if action != NodeAction::Remove {
119            for child in node.children() {
120                if action == NodeAction::Ignore {
121                    child.insert_before_sibling(&node);
122                }
123
124                self.clean_node(child, depth + 1);
125            }
126        }
127
128        if matches!(action, NodeAction::Ignore | NodeAction::Remove) {
129            node.detach();
130        } else if let Some(data) = node.as_element() {
131            self.clean_element_attributes(data);
132        }
133    }
134
135    /// Apply the attributes and element name replacements to the given node.
136    ///
137    /// This might return a different node than the one provided.
138    fn apply_replacements(&self, node: NodeRef) -> NodeRef {
139        let mut element_replacement = None;
140
141        if let NodeData::Element(ElementData { name, attrs, .. }) = node.data() {
142            let element_name = name.local.as_ref();
143
144            // Replace attributes.
145            let list_replacements =
146                self.replace_attrs.as_ref().and_then(|list| list.content.get(element_name));
147            let list_is_override =
148                self.replace_attrs.as_ref().map(|list| list.is_override()).unwrap_or_default();
149            let mode_replacements = (!list_is_override && self.use_strict())
150                .then(|| DEPRECATED_ATTRS.get(element_name))
151                .flatten();
152
153            if list_replacements.is_some() || mode_replacements.is_some() {
154                let mut attrs = attrs.borrow_mut();
155                *attrs = attrs
156                    .clone()
157                    .into_iter()
158                    .map(|mut attr| {
159                        let attr_name = attr.name.local.as_ref();
160
161                        let attr_replacement = list_replacements
162                            .and_then(|s| s.get(attr_name))
163                            .or_else(|| mode_replacements.and_then(|s| s.get(attr_name)))
164                            .copied();
165
166                        if let Some(attr_replacement) = attr_replacement {
167                            attr.name.local = LocalName::from(attr_replacement);
168                        }
169
170                        attr
171                    })
172                    .collect();
173            }
174
175            // Replace element.
176            element_replacement = self
177                .replace_elements
178                .as_ref()
179                .and_then(|list| list.content.get(element_name))
180                .copied();
181
182            if element_replacement.is_none() {
183                let list_is_override = self
184                    .replace_elements
185                    .as_ref()
186                    .map(|list| list.is_override())
187                    .unwrap_or_default();
188                element_replacement = (!list_is_override && self.use_strict())
189                    .then(|| DEPRECATED_ELEMENTS.get(element_name))
190                    .flatten()
191                    .copied();
192            }
193        }
194
195        if let Some(element_replacement) = element_replacement {
196            node.replace_with_element_name(LocalName::from(element_replacement))
197        } else {
198            node
199        }
200    }
201
202    fn node_action(&self, node: &NodeRef, depth: u32) -> NodeAction {
203        match node.data() {
204            NodeData::Element(ElementData { name, attrs, .. }) => {
205                let element_name = name.local.as_ref();
206                let attrs = attrs.borrow();
207
208                // Check if element should be removed.
209                if self.remove_elements.as_ref().is_some_and(|set| set.contains(element_name)) {
210                    return NodeAction::Remove;
211                }
212                if self.remove_reply_fallback && element_name == RICH_REPLY_ELEMENT_NAME {
213                    return NodeAction::Remove;
214                }
215                if self.max_depth_value().is_some_and(|max| depth >= max) {
216                    return NodeAction::Remove;
217                }
218
219                // Check if element should be ignored.
220                if self.ignore_elements.as_ref().is_some_and(|set| set.contains(element_name)) {
221                    return NodeAction::Ignore;
222                }
223
224                // Check if element should be allowed.
225                if self.allow_elements.is_some() || self.use_strict() {
226                    let list_allowed = self
227                        .allow_elements
228                        .as_ref()
229                        .is_some_and(|list| list.content.contains(element_name));
230                    let list_is_override = self
231                        .allow_elements
232                        .as_ref()
233                        .map(|list| list.is_override())
234                        .unwrap_or_default();
235                    let mode_allowed = !list_is_override
236                        && self.use_strict()
237                        && ALLOWED_ELEMENTS_STRICT.contains(element_name);
238
239                    if !list_allowed && !mode_allowed {
240                        return NodeAction::Ignore;
241                    }
242                }
243
244                // Check if element contains scheme that should be denied.
245                if let Some(deny_schemes) =
246                    self.deny_schemes.as_ref().and_then(|map| map.get(element_name))
247                {
248                    for attr in attrs.iter() {
249                        let value = &attr.value;
250                        let attr_name = attr.name.local.as_ref();
251
252                        if let Some(schemes) = deny_schemes.get(attr_name) {
253                            // Check if the scheme is denied.
254                            if schemes.iter().any(|scheme| value.starts_with(&format!("{scheme}:")))
255                            {
256                                return NodeAction::Ignore;
257                            }
258                        }
259                    }
260                }
261
262                if self.allow_schemes.is_none() && !self.use_strict() {
263                    // All schemes are allowed.
264                    return NodeAction::None;
265                }
266
267                // Check if element contains scheme that should be allowed.
268                let list_element_schemes =
269                    self.allow_schemes.as_ref().and_then(|list| list.content.get(element_name));
270                let list_is_override =
271                    self.allow_schemes.as_ref().map(|list| list.is_override()).unwrap_or_default();
272                let strict_mode_element_schemes = (!list_is_override && self.use_strict())
273                    .then(|| ALLOWED_SCHEMES_STRICT.get(element_name))
274                    .flatten();
275                let compat_mode_element_schemes = (!list_is_override && self.use_compat())
276                    .then(|| ALLOWED_SCHEMES_COMPAT.get(element_name))
277                    .flatten();
278
279                if list_element_schemes.is_none()
280                    && strict_mode_element_schemes.is_none()
281                    && compat_mode_element_schemes.is_none()
282                {
283                    // We don't check schemes for this element.
284                    return NodeAction::None;
285                }
286
287                for attr in attrs.iter() {
288                    let value = &attr.value;
289                    let attr_name = attr.name.local.as_ref();
290
291                    let list_attr_schemes = list_element_schemes.and_then(|map| map.get(attr_name));
292                    let strict_mode_attr_schemes =
293                        strict_mode_element_schemes.and_then(|map| map.get(attr_name));
294                    let compat_mode_attr_schemes =
295                        compat_mode_element_schemes.and_then(|map| map.get(attr_name));
296
297                    if list_attr_schemes.is_none()
298                        && strict_mode_attr_schemes.is_none()
299                        && compat_mode_attr_schemes.is_none()
300                    {
301                        // We don't check schemes for this attribute.
302                        return NodeAction::None;
303                    }
304
305                    let mut allowed_schemes = list_attr_schemes
306                        .into_iter()
307                        .flatten()
308                        .chain(strict_mode_attr_schemes.map(|set| set.iter()).into_iter().flatten())
309                        .chain(
310                            compat_mode_attr_schemes.map(|set| set.iter()).into_iter().flatten(),
311                        );
312
313                    // Check if the scheme is allowed.
314                    if !allowed_schemes.any(|scheme| value.starts_with(&format!("{scheme}:"))) {
315                        return NodeAction::Ignore;
316                    }
317                }
318
319                NodeAction::None
320            }
321            NodeData::Text(_) => NodeAction::None,
322            _ => NodeAction::Remove,
323        }
324    }
325
326    fn clean_element_attributes(&self, data: &ElementData) {
327        let ElementData { name, attrs } = data;
328        let element_name = name.local.as_ref();
329        let mut attrs = attrs.borrow_mut();
330
331        let list_remove_attrs = self.remove_attrs.as_ref().and_then(|map| map.get(element_name));
332
333        let whitelist_attrs = self.allow_attrs.is_some() || self.use_strict();
334        let list_allow_attrs =
335            self.allow_attrs.as_ref().and_then(|list| list.content.get(element_name));
336        let list_is_override =
337            self.allow_attrs.as_ref().map(|list| list.is_override()).unwrap_or_default();
338        let mode_allow_attrs = (!list_is_override && self.use_strict())
339            .then(|| ALLOWED_ATTRIBUTES_STRICT.get(element_name))
340            .flatten();
341
342        let list_remove_classes =
343            self.remove_classes.as_ref().and_then(|map| map.get(element_name));
344
345        let whitelist_classes = self.allow_classes.is_some() || self.use_strict();
346        let list_allow_classes =
347            self.allow_classes.as_ref().and_then(|list| list.content.get(element_name));
348        let list_is_override =
349            self.allow_classes.as_ref().map(|list| list.is_override()).unwrap_or_default();
350        let mode_allow_classes = (!list_is_override && self.use_strict())
351            .then(|| ALLOWED_CLASSES_STRICT.get(element_name))
352            .flatten();
353
354        let actions: Vec<_> = attrs
355            .iter()
356            .filter_map(|attr| {
357                let value = &attr.value;
358                let attr_name = attr.name.local.as_ref();
359
360                // Check if the attribute should be removed.
361                if list_remove_attrs.is_some_and(|set| set.contains(attr_name)) {
362                    return Some(AttributeAction::Remove(attr.to_owned()));
363                }
364
365                // Check if the attribute is allowed.
366                if whitelist_attrs {
367                    let list_allowed = list_allow_attrs.is_some_and(|set| set.contains(attr_name));
368                    let mode_allowed = mode_allow_attrs.is_some_and(|set| set.contains(attr_name));
369
370                    if !list_allowed && !mode_allowed {
371                        return Some(AttributeAction::Remove(attr.to_owned()));
372                    }
373                }
374
375                // Filter classes.
376                if attr_name == "class" {
377                    let mut classes = value.split_whitespace().collect::<Vec<_>>();
378                    let initial_len = classes.len();
379
380                    // Process classes to remove.
381                    if let Some(remove_classes) = list_remove_classes {
382                        classes.retain(|class| {
383                            for remove_class in remove_classes {
384                                if WildMatch::new(remove_class).matches(class) {
385                                    return false;
386                                }
387                            }
388
389                            true
390                        });
391                    }
392
393                    // Process classes to allow.
394                    if whitelist_classes {
395                        classes.retain(|class| {
396                            let allow_classes = list_allow_classes
397                                .map(|set| set.iter())
398                                .into_iter()
399                                .flatten()
400                                .chain(
401                                    mode_allow_classes.map(|set| set.iter()).into_iter().flatten(),
402                                );
403
404                            for allow_class in allow_classes {
405                                if WildMatch::new(allow_class).matches(class) {
406                                    return true;
407                                }
408                            }
409
410                            false
411                        });
412                    }
413
414                    if classes.len() == initial_len {
415                        // The list has not changed, no action necessary.
416                        return None;
417                    }
418
419                    if classes.is_empty() {
420                        return Some(AttributeAction::Remove(attr.to_owned()));
421                    } else {
422                        let new_class = classes.join(" ");
423                        return Some(AttributeAction::ReplaceValue(
424                            attr.to_owned(),
425                            new_class.into(),
426                        ));
427                    }
428                }
429
430                None
431            })
432            .collect();
433
434        for action in actions {
435            match action {
436                AttributeAction::ReplaceValue(attr, value) => {
437                    if let Some(mut attr) = attrs.take(&attr) {
438                        attr.value = value;
439                        attrs.insert(attr);
440                    }
441                }
442                AttributeAction::Remove(attr) => {
443                    attrs.remove(&attr);
444                }
445            }
446        }
447    }
448}
449
/// The possible outcomes of the sanitizer's decision about a node.
///
/// The type is a plain tag without payload, so it is `Copy` and can be compared and passed
/// around by value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum NodeAction {
    /// Keep the node as it is.
    None,

    /// Remove the node itself but keep its children in its place.
    Ignore,

    /// Remove the node together with all its children.
    Remove,
}
462
463/// The possible actions to apply to an attribute.
464#[derive(Debug)]
465enum AttributeAction {
466    /// Replace the value of the attribute.
467    ReplaceValue(Attribute, StrTendril),
468
469    /// Remove the attribute.
470    Remove(Attribute),
471}