1use std::{
2 cell::RefCell,
3 collections::BTreeSet,
4 fmt, io,
5 iter::FusedIterator,
6 rc::{Rc, Weak},
7};
8
9use as_variant::as_variant;
10use html5ever::{
11 local_name, ns, parse_fragment,
12 serialize::{serialize, Serialize, SerializeOpts, Serializer, TraversalScope},
13 tendril::{StrTendril, TendrilSink},
14 tree_builder::{NodeOrText, TreeSink},
15 Attribute, LocalName, ParseOpts, QualName,
16};
17use tracing::debug;
18
19#[cfg(feature = "matrix")]
20pub mod matrix;
21
22use crate::SanitizerConfig;
23
24#[derive(Debug)]
29pub struct Html {
30 document: NodeRef,
31}
32
33impl Html {
34 pub fn parse(string: &str) -> Self {
39 let sink = Self::default();
40 let mut parser = parse_fragment(
41 sink,
42 ParseOpts::default(),
43 QualName::new(None, ns!(html), local_name!("div")),
44 Vec::new(),
45 true,
46 );
47 parser.process(string.into());
48 parser.finish()
49 }
50
51 pub fn sanitize(&self) {
56 let config = SanitizerConfig::compat().remove_reply_fallback();
57 self.sanitize_with(&config);
58 }
59
60 pub fn sanitize_with(&self, config: &SanitizerConfig) {
62 config.clean(self);
63 }
64
65 fn root(&self) -> NodeRef {
67 self.document.first_child().expect("html should always have a root node")
68 }
69
70 pub fn has_children(&self) -> bool {
72 self.root().has_children()
73 }
74
75 pub fn first_child(&self) -> Option<NodeRef> {
79 self.root().first_child()
80 }
81
82 pub fn last_child(&self) -> Option<NodeRef> {
86 self.root().last_child()
87 }
88
89 pub fn children(&self) -> Children {
91 Children::new(self.first_child())
92 }
93}
94
95impl Default for Html {
96 fn default() -> Self {
97 Self { document: NodeRef::new(NodeData::Document) }
98 }
99}
100
101impl TreeSink for Html {
102 type Handle = NodeRef;
103 type Output = Self;
104 type ElemName<'a> = html5ever::ExpandedName<'a>;
105
106 fn finish(self) -> Self::Output {
107 self
108 }
109
110 fn parse_error(&self, msg: std::borrow::Cow<'static, str>) {
111 debug!("HTML parse error: {msg}");
112 }
113
114 fn get_document(&self) -> Self::Handle {
115 self.document.clone()
116 }
117
118 fn elem_name<'a>(&'a self, target: &'a Self::Handle) -> html5ever::ExpandedName<'a> {
119 target.as_element().expect("not an element").name.expanded()
120 }
121
122 fn create_element(
123 &self,
124 name: QualName,
125 attrs: Vec<Attribute>,
126 _flags: html5ever::tree_builder::ElementFlags,
127 ) -> Self::Handle {
128 NodeRef::new(NodeData::Element(ElementData {
129 name,
130 attrs: RefCell::new(attrs.into_iter().collect()),
131 }))
132 }
133
134 fn create_comment(&self, _text: StrTendril) -> Self::Handle {
135 NodeRef::new(NodeData::Other)
136 }
137
138 fn create_pi(&self, _target: StrTendril, _data: StrTendril) -> Self::Handle {
139 NodeRef::new(NodeData::Other)
140 }
141
142 fn append(&self, parent: &Self::Handle, child: NodeOrText<Self::Handle>) {
143 match child {
144 NodeOrText::AppendNode(node) => parent.append_child(node),
145 NodeOrText::AppendText(text) => {
146 if let Some(prev_text) =
148 parent.last_child().as_ref().and_then(|sibling| sibling.as_text())
149 {
150 prev_text.borrow_mut().push_tendril(&text);
151 } else {
152 let node = NodeRef::new(NodeData::Text(text.into()));
153 parent.append_child(node);
154 }
155 }
156 }
157 }
158
159 fn append_based_on_parent_node(
160 &self,
161 element: &Self::Handle,
162 prev_element: &Self::Handle,
163 child: NodeOrText<Self::Handle>,
164 ) {
165 if element.0.parent.borrow().is_some() {
166 self.append_before_sibling(element, child);
167 } else {
168 self.append(prev_element, child);
169 }
170 }
171
172 fn append_doctype_to_document(
173 &self,
174 _name: StrTendril,
175 _public_id: StrTendril,
176 _system_id: StrTendril,
177 ) {
178 }
179
180 fn get_template_contents(&self, target: &Self::Handle) -> Self::Handle {
181 target.clone()
182 }
183
184 fn same_node(&self, x: &Self::Handle, y: &Self::Handle) -> bool {
185 Rc::ptr_eq(&x.0, &y.0)
186 }
187
188 fn set_quirks_mode(&self, _mode: html5ever::tree_builder::QuirksMode) {}
189
190 fn append_before_sibling(&self, sibling: &Self::Handle, new_node: NodeOrText<Self::Handle>) {
191 match new_node {
192 NodeOrText::AppendNode(node) => node.insert_before_sibling(sibling),
193 NodeOrText::AppendText(text) => {
194 if let Some(prev_text) =
196 sibling.prev_sibling().as_ref().and_then(|prev_sibling| prev_sibling.as_text())
197 {
198 prev_text.borrow_mut().push_tendril(&text);
199 } else {
200 let node = NodeRef::new(NodeData::Text(text.into()));
201 node.insert_before_sibling(sibling);
202 }
203 }
204 }
205 }
206
207 fn add_attrs_if_missing(&self, target: &Self::Handle, attrs: Vec<Attribute>) {
208 let element = target.as_element().unwrap();
209 element.attrs.borrow_mut().extend(attrs);
210 }
211
212 fn remove_from_parent(&self, target: &Self::Handle) {
213 target.detach();
214 }
215
216 fn reparent_children(&self, node: &Self::Handle, new_parent: &Self::Handle) {
217 for child in node.0.children.take() {
218 child.0.parent.take();
219 new_parent.append_child(child);
220 }
221 }
222}
223
224impl Serialize for Html {
225 fn serialize<S>(&self, serializer: &mut S, traversal_scope: TraversalScope) -> io::Result<()>
226 where
227 S: Serializer,
228 {
229 match traversal_scope {
230 TraversalScope::IncludeNode => {
231 for child in self.children() {
232 child.serialize(serializer)?;
233 }
234
235 Ok(())
236 }
237 TraversalScope::ChildrenOnly(_) => Ok(()),
238 }
239 }
240}
241
242impl fmt::Display for Html {
243 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
244 let mut u8_vec = Vec::new();
245 serialize(
246 &mut u8_vec,
247 self,
248 SerializeOpts { traversal_scope: TraversalScope::IncludeNode, ..Default::default() },
249 )
250 .unwrap();
251
252 f.write_str(&String::from_utf8(u8_vec).unwrap())?;
253
254 Ok(())
255 }
256}
257
258#[derive(Debug)]
260#[non_exhaustive]
261struct Node {
262 parent: RefCell<Option<Weak<Node>>>,
263 children: RefCell<Vec<NodeRef>>,
264 data: NodeData,
265}
266
267impl Node {
268 fn new(data: NodeData) -> Self {
270 Self { parent: Default::default(), children: Default::default(), data }
271 }
272
273 fn as_element(&self) -> Option<&ElementData> {
275 as_variant!(&self.data, NodeData::Element)
276 }
277
278 fn as_text(&self) -> Option<&RefCell<StrTendril>> {
280 as_variant!(&self.data, NodeData::Text)
281 }
282
283 fn is_root(&self) -> bool {
285 matches!(&self.data, NodeData::Element(element_data) if element_data.name.local.as_bytes() == b"html")
287 }
288
289 fn parent(&self) -> Option<NodeRef> {
291 self.parent.borrow().as_ref()?.upgrade().map(NodeRef)
292 }
293}
294
295#[derive(Debug, Clone)]
297#[allow(clippy::exhaustive_enums)]
298pub enum NodeData {
299 Document,
301
302 Text(RefCell<StrTendril>),
304
305 Element(ElementData),
307
308 Other,
310}
311
312#[derive(Debug, Clone)]
314#[allow(clippy::exhaustive_structs)]
315pub struct ElementData {
316 pub name: QualName,
318
319 pub attrs: RefCell<BTreeSet<Attribute>>,
321}
322
323impl ElementData {
324 #[cfg(feature = "matrix")]
328 pub fn to_matrix(&self) -> matrix::MatrixElementData {
329 matrix::MatrixElementData::parse(&self.name, &self.attrs.borrow())
330 }
331}
332
333#[derive(Debug, Clone)]
335#[non_exhaustive]
336pub struct NodeRef(Rc<Node>);
337
338impl NodeRef {
339 fn new(data: NodeData) -> Self {
341 Self(Node::new(data).into())
342 }
343
344 pub(crate) fn detach(&self) {
346 if let Some((parent, index)) = self.parent_and_index() {
347 parent.0.children.borrow_mut().remove(index);
348 self.0.parent.take();
349 }
350 }
351
352 fn append_child(&self, child: NodeRef) {
356 child.detach();
357
358 child.0.parent.replace(Some(Rc::downgrade(&self.0)));
359 self.0.children.borrow_mut().push(child);
360 }
361
362 fn parent_and_index(&self) -> Option<(NodeRef, usize)> {
364 let parent = self.0.parent()?;
365 let i = parent
366 .0
367 .children
368 .borrow()
369 .iter()
370 .position(|child| Rc::ptr_eq(&child.0, &self.0))
371 .expect("child should be in parent's children");
372 Some((parent, i))
373 }
374
375 pub(crate) fn insert_before_sibling(&self, sibling: &NodeRef) {
379 self.detach();
380
381 let (parent, index) = sibling.parent_and_index().expect("sibling should have parent");
382
383 self.0.parent.replace(Some(Rc::downgrade(&parent.0)));
384 parent.0.children.borrow_mut().insert(index, self.clone());
385 }
386
387 pub(crate) fn replace_with_element_name(self, name: LocalName) -> NodeRef {
392 let mut element_data = self.as_element().unwrap().clone();
393 element_data.name.local = name;
394
395 let new_node = NodeRef::new(NodeData::Element(element_data));
396
397 for child in self.children() {
398 new_node.append_child(child);
399 }
400
401 new_node.insert_before_sibling(&self);
402 self.detach();
403
404 new_node
405 }
406
407 pub fn data(&self) -> &NodeData {
409 &self.0.data
410 }
411
412 pub fn as_element(&self) -> Option<&ElementData> {
414 self.0.as_element()
415 }
416
417 pub fn as_text(&self) -> Option<&RefCell<StrTendril>> {
419 self.0.as_text()
420 }
421
422 pub fn parent(&self) -> Option<NodeRef> {
426 let parent = self.0.parent()?;
427
428 if parent.0.is_root() {
430 return None;
431 }
432
433 Some(parent)
434 }
435
436 pub fn next_sibling(&self) -> Option<NodeRef> {
440 let (parent, index) = self.parent_and_index()?;
441 let index = index.checked_add(1)?;
442 let sibling = parent.0.children.borrow().get(index).cloned();
443 sibling
444 }
445
446 pub fn prev_sibling(&self) -> Option<NodeRef> {
450 let (parent, index) = self.parent_and_index()?;
451 let index = index.checked_sub(1)?;
452 let sibling = parent.0.children.borrow().get(index).cloned();
453 sibling
454 }
455
456 pub fn has_children(&self) -> bool {
458 !self.0.children.borrow().is_empty()
459 }
460
461 pub fn first_child(&self) -> Option<NodeRef> {
465 self.0.children.borrow().first().cloned()
466 }
467
468 pub fn last_child(&self) -> Option<NodeRef> {
472 self.0.children.borrow().last().cloned()
473 }
474
475 pub fn children(&self) -> Children {
477 Children::new(self.first_child())
478 }
479
480 pub(crate) fn serialize<S>(&self, serializer: &mut S) -> io::Result<()>
481 where
482 S: Serializer,
483 {
484 match self.data() {
485 NodeData::Element(data) => {
486 serializer.start_elem(
487 data.name.clone(),
488 data.attrs.borrow().iter().map(|attr| (&attr.name, &*attr.value)),
489 )?;
490
491 for child in self.children() {
492 child.serialize(serializer)?;
493 }
494
495 serializer.end_elem(data.name.clone())?;
496
497 Ok(())
498 }
499 NodeData::Document => {
500 for child in self.children() {
501 child.serialize(serializer)?;
502 }
503
504 Ok(())
505 }
506 NodeData::Text(text) => serializer.write_text(&text.borrow()),
507 _ => Ok(()),
508 }
509 }
510}
511
512#[derive(Debug, Clone)]
516pub struct Children {
517 next: Option<NodeRef>,
518}
519
520impl Children {
521 fn new(start_node: Option<NodeRef>) -> Self {
523 Self { next: start_node }
524 }
525}
526
527impl Iterator for Children {
528 type Item = NodeRef;
529
530 fn next(&mut self) -> Option<Self::Item> {
531 let next = self.next.take()?;
532 self.next = next.next_sibling();
533 Some(next)
534 }
535}
536
537impl FusedIterator for Children {}
538
#[cfg(test)]
mod tests {
    use super::Html;

    /// Parsing and re-serializing must give back the input unchanged, both for a
    /// small nested document and for the empty string.
    #[test]
    fn sanity() {
        let html = "\
            <h1>Title</h1>\
            <div>\
                <p>This is some <em>text</em></p>\
            </div>\
        ";

        for input in [html, ""] {
            assert_eq!(Html::parse(input).to_string(), input);
        }
    }
}