1use std::{
2 cell::RefCell,
3 collections::BTreeSet,
4 fmt, io,
5 iter::FusedIterator,
6 rc::{Rc, Weak},
7};
8
9use as_variant::as_variant;
10use html5ever::{
11 local_name, namespace_url, ns, parse_fragment,
12 serialize::{serialize, Serialize, SerializeOpts, Serializer, TraversalScope},
13 tendril::{StrTendril, TendrilSink},
14 tree_builder::{NodeOrText, TreeSink},
15 Attribute, LocalName, ParseOpts, QualName,
16};
17use tracing::debug;
18
19#[cfg(feature = "matrix")]
20pub mod matrix;
21
22use crate::SanitizerConfig;
23
/// An HTML fragment, stored as a tree of reference-counted nodes.
///
/// Build one with [`Html::parse`]; the `Display` implementation renders it back to a string.
#[derive(Debug)]
pub struct Html {
    /// The node at the top of the tree. `Default` creates it as a `NodeData::Document` node.
    document: NodeRef,
}
32
33impl Html {
34 pub fn parse(string: &str) -> Self {
39 let sink = Self::default();
40 let mut parser = parse_fragment(
41 sink,
42 ParseOpts::default(),
43 QualName::new(None, ns!(html), local_name!("div")),
44 Vec::new(),
45 );
46 parser.process(string.into());
47 parser.finish()
48 }
49
50 pub fn sanitize(&self) {
55 let config = SanitizerConfig::compat().remove_reply_fallback();
56 self.sanitize_with(&config);
57 }
58
59 pub fn sanitize_with(&self, config: &SanitizerConfig) {
61 config.clean(self);
62 }
63
64 fn root(&self) -> NodeRef {
66 self.document.first_child().expect("html should always have a root node")
67 }
68
69 pub fn has_children(&self) -> bool {
71 self.root().has_children()
72 }
73
74 pub fn first_child(&self) -> Option<NodeRef> {
78 self.root().first_child()
79 }
80
81 pub fn last_child(&self) -> Option<NodeRef> {
85 self.root().last_child()
86 }
87
88 pub fn children(&self) -> Children {
90 Children::new(self.first_child())
91 }
92}
93
94impl Default for Html {
95 fn default() -> Self {
96 Self { document: NodeRef::new(NodeData::Document) }
97 }
98}
99
100impl TreeSink for Html {
101 type Handle = NodeRef;
102 type Output = Self;
103 type ElemName<'a> = html5ever::ExpandedName<'a>;
104
105 fn finish(self) -> Self::Output {
106 self
107 }
108
109 fn parse_error(&self, msg: std::borrow::Cow<'static, str>) {
110 debug!("HTML parse error: {msg}");
111 }
112
113 fn get_document(&self) -> Self::Handle {
114 self.document.clone()
115 }
116
117 fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> html5ever::ExpandedName<'a> {
118 target.as_element().expect("not an element").name.expanded()
119 }
120
121 fn create_element(
122 &self,
123 name: QualName,
124 attrs: Vec<Attribute>,
125 _flags: html5ever::tree_builder::ElementFlags,
126 ) -> Self::Handle {
127 NodeRef::new(NodeData::Element(ElementData {
128 name,
129 attrs: RefCell::new(attrs.into_iter().collect()),
130 }))
131 }
132
133 fn create_comment(&self, _text: StrTendril) -> Self::Handle {
134 NodeRef::new(NodeData::Other)
135 }
136
137 fn create_pi(&self, _target: StrTendril, _data: StrTendril) -> Self::Handle {
138 NodeRef::new(NodeData::Other)
139 }
140
141 fn append(&self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
142 match child {
143 NodeOrText::AppendNode(node) => parent.append_child(node),
144 NodeOrText::AppendText(text) => {
145 if let Some(prev_text) =
147 parent.last_child().as_ref().and_then(|sibling| sibling.as_text())
148 {
149 prev_text.borrow_mut().push_tendril(&text);
150 } else {
151 let node = NodeRef::new(NodeData::Text(text.into()));
152 parent.append_child(node);
153 }
154 }
155 }
156 }
157
158 fn append_based_on_parent_node(
159 &self,
160 element: &Self::Handle,
161 prev_element: &Self::Handle,
162 child: NodeOrText<Self::Handle>,
163 ) {
164 if element.0.parent.borrow().is_some() {
165 self.append_before_sibling(element, child);
166 } else {
167 self.append(prev_element, child);
168 }
169 }
170
171 fn append_doctype_to_document(
172 &self,
173 _name: StrTendril,
174 _public_id: StrTendril,
175 _system_id: StrTendril,
176 ) {
177 }
178
179 fn get_template_contents(&self, target: &Self::Handle) -> Self::Handle {
180 target.clone()
181 }
182
183 fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
184 Rc::ptr_eq(&x.0, &y.0)
185 }
186
187 fn set_quirks_mode(&self, _mode: html5ever::tree_builder::QuirksMode) {}
188
189 fn append_before_sibling(&self, sibling: &Self::Handle, new_node: NodeOrText<Self::Handle>) {
190 match new_node {
191 NodeOrText::AppendNode(node) => node.insert_before_sibling(sibling),
192 NodeOrText::AppendText(text) => {
193 if let Some(prev_text) =
195 sibling.prev_sibling().as_ref().and_then(|prev_sibling| prev_sibling.as_text())
196 {
197 prev_text.borrow_mut().push_tendril(&text);
198 } else {
199 let node = NodeRef::new(NodeData::Text(text.into()));
200 node.insert_before_sibling(sibling);
201 }
202 }
203 }
204 }
205
206 fn add_attrs_if_missing(&self, target: &Self::Handle, attrs: Vec<Attribute>) {
207 let element = target.as_element().unwrap();
208 element.attrs.borrow_mut().extend(attrs);
209 }
210
211 fn remove_from_parent(&self, target: &Self::Handle) {
212 target.detach();
213 }
214
215 fn reparent_children(&self, node: &Self::Handle, new_parent: &Self::Handle) {
216 for child in node.0.children.take() {
217 child.0.parent.take();
218 new_parent.append_child(child);
219 }
220 }
221}
222
223impl Serialize for Html {
224 fn serialize<S>(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()>
225 where
226 S: Serializer,
227 {
228 match traversal_scope {
229 TraversalScope::IncludeNode => {
230 for child in self.children() {
231 child.serialize(serializer)?;
232 }
233
234 Ok(())
235 }
236 TraversalScope::ChildrenOnly(_) => Ok(()),
237 }
238 }
239}
240
241impl fmt::Display for Html {
242 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
243 let mut u8_vec = Vec::new();
244 serialize(
245 &mut u8_vec,
246 self,
247 SerializeOpts { traversal_scope: TraversalScope::IncludeNode, ..Default::default() },
248 )
249 .unwrap();
250
251 f.write_str(&String::from_utf8(u8_vec).unwrap())?;
252
253 Ok(())
254 }
255}
256
/// A single node of the tree, shared through [`NodeRef`] handles.
#[derive(Debug)]
#[non_exhaustive]
struct Node {
    /// Weak pointer to the parent node; `None` while the node is detached.
    parent: RefCell<Option<Weak<Node>>>,
    /// Strong handles to the child nodes, in order.
    children: RefCell<Vec<NodeRef>>,
    /// The kind of node and its payload.
    data: NodeData,
}
265
266impl Node {
267 fn new(data: NodeData) -> Self {
269 Self { parent: Default::default(), children: Default::default(), data }
270 }
271
272 fn as_element(&self) -> Option<&ElementData> {
274 as_variant!(&self.data, NodeData::Element)
275 }
276
277 fn as_text(&self) -> Option<&RefCell<StrTendril>> {
279 as_variant!(&self.data, NodeData::Text)
280 }
281
282 fn is_root(&self) -> bool {
284 matches!(&self.data, NodeData::Element(element_data) if element_data.name.local.as_bytes() == b"html")
286 }
287
288 fn parent(&self) -> Option<NodeRef> {
290 self.parent.borrow().as_ref()?.upgrade().map(NodeRef)
291 }
292}
293
/// The data carried by a node.
#[derive(Debug, Clone)]
#[allow(clippy::exhaustive_enums)]
pub enum NodeData {
    /// The document node at the top of the tree.
    Document,

    /// A text node and its contents, mutable through the `RefCell`.
    Text(RefCell<StrTendril>),

    /// An element, with its name and attributes.
    Element(ElementData),

    /// Any other kind of node; the parser sink uses it for comments and processing
    /// instructions, whose contents are not kept.
    Other,
}
310
/// The data of an element node.
#[derive(Debug, Clone)]
#[allow(clippy::exhaustive_structs)]
pub struct ElementData {
    /// The qualified name of the element.
    pub name: QualName,

    /// The attributes of the element, mutable through the `RefCell`.
    pub attrs: RefCell<BTreeSet<Attribute>>,
}
321
322impl ElementData {
323 #[cfg(feature = "matrix")]
327 pub fn to_matrix(&self) -> matrix::MatrixElementData {
328 matrix::MatrixElementData::parse(&self.name, &self.attrs.borrow())
329 }
330}
331
/// A reference-counted handle to a node of the tree.
///
/// Cloning the handle is cheap and yields another handle to the same node, not a copy of it.
#[derive(Debug, Clone)]
#[non_exhaustive]
pub struct NodeRef(Rc<Node>);
336
337impl NodeRef {
338 fn new(data: NodeData) -> Self {
340 Self(Node::new(data).into())
341 }
342
343 pub(crate) fn detach(&self) {
345 if let Some((parent, index)) = self.parent_and_index() {
346 parent.0.children.borrow_mut().remove(index);
347 self.0.parent.take();
348 }
349 }
350
351 fn append_child(&self, child: NodeRef) {
355 child.detach();
356
357 child.0.parent.replace(Some(Rc::downgrade(&self.0)));
358 self.0.children.borrow_mut().push(child);
359 }
360
361 fn parent_and_index(&self) -> Option<(NodeRef, usize)> {
363 let parent = self.0.parent()?;
364 let i = parent
365 .0
366 .children
367 .borrow()
368 .iter()
369 .position(|child| Rc::ptr_eq(&child.0, &self.0))
370 .expect("child should be in parent's children");
371 Some((parent, i))
372 }
373
374 pub(crate) fn insert_before_sibling(&self, sibling: &NodeRef) {
378 self.detach();
379
380 let (parent, index) = sibling.parent_and_index().expect("sibling should have parent");
381
382 self.0.parent.replace(Some(Rc::downgrade(&parent.0)));
383 parent.0.children.borrow_mut().insert(index, self.clone());
384 }
385
386 pub(crate) fn replace_with_element_name(self, name: LocalName) -> NodeRef {
391 let mut element_data = self.as_element().unwrap().clone();
392 element_data.name.local = name;
393
394 let new_node = NodeRef::new(NodeData::Element(element_data));
395
396 for child in self.children() {
397 new_node.append_child(child);
398 }
399
400 new_node.insert_before_sibling(&self);
401 self.detach();
402
403 new_node
404 }
405
406 pub fn data(&self) -> &NodeData {
408 &self.0.data
409 }
410
411 pub fn as_element(&self) -> Option<&ElementData> {
413 self.0.as_element()
414 }
415
416 pub fn as_text(&self) -> Option<&RefCell<StrTendril>> {
418 self.0.as_text()
419 }
420
421 pub fn parent(&self) -> Option<NodeRef> {
425 let parent = self.0.parent()?;
426
427 if parent.0.is_root() {
429 return None;
430 }
431
432 Some(parent)
433 }
434
435 pub fn next_sibling(&self) -> Option<NodeRef> {
439 let (parent, index) = self.parent_and_index()?;
440 let index = index.checked_add(1)?;
441 let sibling = parent.0.children.borrow().get(index).cloned();
442 sibling
443 }
444
445 pub fn prev_sibling(&self) -> Option<NodeRef> {
449 let (parent, index) = self.parent_and_index()?;
450 let index = index.checked_sub(1)?;
451 let sibling = parent.0.children.borrow().get(index).cloned();
452 sibling
453 }
454
455 pub fn has_children(&self) -> bool {
457 !self.0.children.borrow().is_empty()
458 }
459
460 pub fn first_child(&self) -> Option<NodeRef> {
464 self.0.children.borrow().first().cloned()
465 }
466
467 pub fn last_child(&self) -> Option<NodeRef> {
471 self.0.children.borrow().last().cloned()
472 }
473
474 pub fn children(&self) -> Children {
476 Children::new(self.first_child())
477 }
478
479 pub(crate) fn serialize<S>(&self, serializer: &mut S) -> io::Result<()>
480 where
481 S: Serializer,
482 {
483 match self.data() {
484 NodeData::Element(data) => {
485 serializer.start_elem(
486 data.name.clone(),
487 data.attrs.borrow().iter().map(|attr| (&attr.name, &*attr.value)),
488 )?;
489
490 for child in self.children() {
491 child.serialize(serializer)?;
492 }
493
494 serializer.end_elem(data.name.clone())?;
495
496 Ok(())
497 }
498 NodeData::Document => {
499 for child in self.children() {
500 child.serialize(serializer)?;
501 }
502
503 Ok(())
504 }
505 NodeData::Text(text) => serializer.write_text(&text.borrow()),
506 _ => Ok(()),
507 }
508 }
509}
510
/// An iterator over sibling nodes, as returned by `Html::children` and `NodeRef::children`.
///
/// It yields its starting node and then follows `NodeRef::next_sibling` until there is none.
#[derive(Debug, Clone)]
pub struct Children {
    /// The node to yield on the next call to `next`, if any.
    next: Option<NodeRef>,
}
518
519impl Children {
520 fn new(start_node: Option<NodeRef>) -> Self {
522 Self { next: start_node }
523 }
524}
525
526impl Iterator for Children {
527 type Item = NodeRef;
528
529 fn next(&mut self) -> Option<Self::Item> {
530 let next = self.next.take()?;
531 self.next = next.next_sibling();
532 Some(next)
533 }
534}
535
// Once the pending node is `None`, `Iterator::next` returns `None` and leaves it `None`, so the
// iterator keeps returning `None` after it is exhausted.
impl FusedIterator for Children {}
537
#[cfg(test)]
mod tests {
    use super::Html;

    /// Parsing and then re-serializing well-formed HTML must give back the input unchanged.
    #[test]
    fn sanity() {
        let input = concat!(
            "<h1>Title</h1>",
            "<div>",
            "<p>This is some <em>text</em></p>",
            "</div>",
        );
        let parsed = Html::parse(input);
        assert_eq!(parsed.to_string(), input);

        // An empty input must serialize to an empty string.
        let empty = Html::parse("");
        assert_eq!(empty.to_string(), "");
    }
}