Coverage for website / builder / markdown.py: 85%
336 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-10 09:37 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-10 09:37 +0000
1"""
2Markdown Processing - Markdown-to-HTML Conversion.
4This module handles markdown processing, HTML conversion,
5and content formatting for the website builder.
6"""
8import re
class MarkdownProcessor:
    """Handles markdown processing and HTML conversion."""

    def markdown_to_html(
        self, markdown_content: str, source_file: str = "", output_file: str = ""
    ) -> str:
        """Convert markdown to HTML with Bootstrap styling.

        The optional ``markdown`` package is used when it can be imported;
        otherwise the built-in line-based converter is used. Both paths then
        receive Bootstrap classes, task-list checkboxes and heading IDs.

        Args:
            markdown_content: Markdown source text.
            source_file: Accepted for interface compatibility; unused here.
            output_file: Accepted for interface compatibility; unused here.

        Returns:
            The rendered HTML, or ``""`` for empty/whitespace-only input.
        """
        # Normalize empty/whitespace-only content consistently across code paths
        if not markdown_content or not markdown_content.strip():
            return ""

        try:
            html = self._convert_with_markdown_library(markdown_content)
        except ImportError:
            # Fallback to basic conversion when the package is unavailable
            html = self._basic_markdown_to_html_no_regex(markdown_content)
        else:
            # Only library output needs the malformed-code-block repairs
            html = self.fix_malformed_code_blocks(html)

        html = self.add_bootstrap_classes(html)
        # Render GitHub-style task list markers as checkboxes
        html = self.render_task_list_checkboxes(html)
        return self.ensure_heading_ids(html)

    def _convert_with_markdown_library(self, markdown_content: str) -> str:
        """Render markdown with the ``markdown`` package.

        Raises:
            ImportError: If the ``markdown`` package cannot be imported.
        """
        import markdown

        md = markdown.Markdown(
            extensions=[
                "fenced_code",
                "codehilite",
                "tables",
                "toc",
                "attr_list",
                "def_list",
                "footnotes",
                "md_in_html",
                "sane_lists",
            ],
            extension_configs={
                "codehilite": {
                    "css_class": "codehilite",
                    "use_pygments": False,  # Use simple highlighting without Pygments
                    "guess_lang": True,
                }
            },
        )
        return md.convert(markdown_content)

    def _basic_markdown_to_html_no_regex(self, markdown_content: str) -> str:
        """Convert markdown to HTML with a small line-based fallback parser.

        Supports fenced code blocks, ``#``-style headings (levels 1-6),
        ``- `` bullet lists, paragraphs, and inline bold/italic/code/links.
        Despite the historical name, inline formatting is done with ``re``.

        Args:
            markdown_content: Markdown source text.

        Returns:
            HTML lines joined with ``"\\n"``, or ``""`` for blank input.
        """
        import html as _html

        if not markdown_content or not markdown_content.strip():
            return ""

        def transform_inline(text: str) -> str:
            # Bold must run before italics so "**x**" is not read as "*...*"
            text = re.sub(r"\*\*([^*]+)\*\*", r"<strong>\1</strong>", text)
            text = re.sub(r"\*([^*]+)\*", r"<em>\1</em>", text)
            # Inline code
            text = re.sub(r"`([^`]+)`", r"<code>\1</code>", text)
            # Links [text](url)
            return re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r'<a href="\2">\1</a>', text)

        html_lines: list[str] = []
        in_code_block = False
        in_list = False

        for line in markdown_content.split("\n"):
            # Drop the carriage return left behind by CRLF line endings
            raw = line.rstrip("\r\n")

            if raw.startswith("```"):
                if in_code_block:
                    html_lines.append("</code></pre>")
                    in_code_block = False
                else:
                    # Close any open list before starting a code block
                    if in_list:
                        html_lines.append("</ul>")
                        in_list = False
                    html_lines.append("<pre><code>")
                    in_code_block = True
                continue

            if in_code_block:
                # Code is literal text: escape it so "<" and "&" are not
                # interpreted as markup by the browser or later regex passes.
                html_lines.append(_html.escape(raw, quote=False))
                continue

            # Headings: one to six leading "#" followed by a space
            level = len(raw) - len(raw.lstrip("#"))
            if 1 <= level <= 6 and raw[level:level + 1] == " ":
                if in_list:
                    html_lines.append("</ul>")
                    in_list = False
                heading_text = transform_inline(raw[level + 1:])
                html_lines.append(f"<h{level}>{heading_text}</h{level}>")
                continue

            # Bullet list items
            stripped = raw.lstrip()
            if stripped.startswith("- "):
                if not in_list:
                    html_lines.append("<ul>")
                    in_list = True
                html_lines.append(f"<li>{transform_inline(stripped[2:])}</li>")
                continue

            # A blank line ends the current list and produces no output
            if not stripped:
                if in_list:
                    html_lines.append("</ul>")
                    in_list = False
                continue

            # Paragraphs: <p> is not valid inside <ul>, so close the list first
            if in_list:
                html_lines.append("</ul>")
                in_list = False
            html_lines.append(f"<p>{transform_inline(raw)}</p>")

        # Close constructs left open at end of input
        if in_code_block:
            html_lines.append("</code></pre>")
        if in_list:
            html_lines.append("</ul>")

        return "\n".join(html_lines)

    def fix_malformed_code_blocks(self, html_content: str) -> str:
        """Fix code blocks that weren't properly converted by markdown.

        Applies a fixed sequence of regex repairs to already-rendered HTML:
        promotes single-paragraph shell snippets to code blocks, removes a
        paragraph break that landed inside a ``<code>`` element, and converts
        paragraphs that still contain literal triple-backtick fences.

        Args:
            html_content: HTML produced by the markdown library.

        Returns:
            The repaired HTML.
        """
        bash_block = (
            r'<div class="code-block-wrapper"><pre class="code-block">'
            r'<code class="language-bash">\1</code></pre></div>'
        )
        language_block = (
            r'<div class="code-block-wrapper"><pre class="code-block">'
            r'<code class="language-\1">\2</code></pre></div>'
        )

        # Inline code that starts with a "bash"/"sh" line is really a code block
        html_content = re.sub(
            r'<p><code class="inline-code">(bash|sh)\s*\n\s*([^<]+)</code></p>',
            language_block,
            html_content,
        )

        # Paragraphs consisting only of a shell command (with or without a
        # "bash" prefix line), with and without the inline-code class
        html_content = re.sub(
            r'<p><code class="inline-code">(?:bash\s*\n\s*)?([^<]*(?:mkdir|cd|pip|qdrant-loader|mcp-)[^<]*)</code></p>',
            bash_block,
            html_content,
        )
        html_content = re.sub(
            r"<p><code>(?:bash\s*\n\s*)?([^<]*(?:mkdir|cd|pip|qdrant-loader|mcp-)[^<]*)</code></p>",
            bash_block,
            html_content,
        )

        # Clean up a stray </p><p> that sits inside one <code> element.
        # The tempered dots stop at </code> / <code, so two separate
        # paragraphs that merely both contain inline code are left alone.
        html_content = re.sub(
            r"(<code[^>]*>(?:(?!</code>).)*?)</p>\s*<p>((?:(?!<code).)*?</code>)",
            r"\1\n\2",
            html_content,
            flags=re.DOTALL,
        )

        # Fix paragraphs that contain triple backticks (malformed code blocks)
        def fix_code_block(match):
            lines = match.group(1).split("\n")
            first_line = lines[0].strip()
            if first_line.startswith("```"):
                # Text after the opening fence is the language tag
                language = first_line[3:].strip()
                code_content = "\n".join(lines[1:])
                # Remove trailing ``` if present
                if code_content.endswith("```"):
                    code_content = code_content[:-3].rstrip()
                return f'<div class="code-block-wrapper"><pre class="code-block"><code class="language-{language}">{code_content}</code></pre></div>'
            return match.group(0)

        html_content = re.sub(
            r"<p>(```[^`]*```)</p>", fix_code_block, html_content, flags=re.DOTALL
        )

        # Fence, body and closing fence each rendered as their own paragraph
        html_content = re.sub(
            r"<p>```(\w+)\s*</p>\s*<p>(.*?)</p>\s*<p>```</p>",
            language_block,
            html_content,
            flags=re.DOTALL,
        )

        # Any remaining paragraph that opens and closes with a fence
        html_content = re.sub(
            r"<p>```(\w+)?\s*(.*?)\s*```</p>",
            lambda m: f'<div class="code-block-wrapper"><pre class="code-block"><code class="language-{m.group(1) or ""}">{m.group(2)}</code></pre></div>',
            html_content,
            flags=re.DOTALL,
        )

        return html_content

    def ensure_heading_ids(self, html_content: str) -> str:
        """Ensure all headings have IDs for anchor links.

        Headings that already carry an ``id`` attribute are kept. For the
        others an ID is derived from the visible text (or the first image's
        ``alt`` text). When no non-empty slug can be derived the heading is
        left without an ID rather than given an invalid empty one.

        Args:
            html_content: HTML to process.

        Returns:
            The HTML with ``id`` attributes added to ``<h1>``-``<h6>``.
        """

        def slugify(text: str) -> str:
            """Convert text to URL-safe slug."""
            slug = re.sub(r"[^\w\s-]", "", text.lower())
            return re.sub(r"[-\s]+", "-", slug).strip("-")

        def _extract_text(fragment: str) -> str:
            """Return visible text for a piece of HTML (fall back to img alt)."""
            text_only = re.sub(r"<[^>]+>", "", fragment).strip()
            if text_only:
                return text_only
            # If no visible text, try to get alt from first <img>
            alt_match = re.search(r'<img[^>]*alt=["\']([^"\']+)["\']', fragment)
            return alt_match.group(1).strip() if alt_match else ""

        def add_id(match: re.Match) -> str:
            """Add ID to heading if not present."""
            tag = match.group(1)
            attrs = match.group(2) or ""
            content = match.group(3) or ""

            # Look for a real "id" attribute; a plain substring test would
            # also accept attributes such as data-id="...".
            if not re.search(r"(?<![\w-])id\s*=", attrs, flags=re.IGNORECASE):
                heading_id = slugify(_extract_text(content) or content)
                if heading_id:
                    if attrs:
                        attrs = f' id="{heading_id}" {attrs.strip()}'
                    else:
                        attrs = f' id="{heading_id}"'

            return f"<{tag}{attrs}>{content}</{tag}>"

        # Match headings even when they contain HTML inside
        heading_pattern = r"<(h[1-6])([^>]*)>(.*?)</h[1-6]>"
        return re.sub(heading_pattern, add_id, html_content, flags=re.DOTALL)

    def add_bootstrap_classes(self, html_content: str) -> str:
        """Add Bootstrap classes to HTML elements.

        Covers headings, code blocks (wrapped in a ``code-block-wrapper``
        div), inline code, links, lists, tables and blockquotes.

        NOTE: heading and link classes are appended as a new ``class``
        attribute even when the tag already has one.

        Args:
            html_content: HTML to decorate.

        Returns:
            The decorated HTML.
        """
        heading_classes = {
            "h1": "display-4 fw-bold text-primary mb-4",
            "h2": "h2 fw-bold text-primary mt-5 mb-3",
            "h3": "h3 fw-bold text-primary mt-5 mb-3",
            "h4": "h4 fw-bold mt-4 mb-3",
            "h5": "h5 fw-bold mt-3 mb-2",
            "h6": "h6 fw-semibold mt-2 mb-1",
        }
        for tag, classes in heading_classes.items():
            html_content = re.sub(
                rf"<{tag}([^>]*)>", rf'<{tag}\1 class="{classes}">', html_content
            )

        # Code blocks: first rename codehilite divs to our wrapper
        html_content = html_content.replace(
            '<div class="codehilite">', '<div class="code-block-wrapper">'
        )

        # Wrap standalone pre blocks (not already in wrappers)
        html_content = re.sub(
            r'(?<!<div class="code-block-wrapper">)<pre>',
            '<div class="code-block-wrapper"><pre class="code-block">',
            html_content,
        )

        # Add code-block class to pre tags that don't have it
        html_content = re.sub(
            r'<pre(?![^>]*class="code-block")([^>]*)>',
            r'<pre class="code-block"\1>',
            html_content,
        )

        # Close wrapper divs only for pre blocks that we wrapped. The
        # tempered dot keeps the match inside this block's own </pre>.
        html_content = re.sub(
            r'(<div class="code-block-wrapper"><pre class="code-block"[^>]*>(?:(?!</pre>).)*?)</pre>(?!</div>)',
            r"\1</pre></div>",
            html_content,
            flags=re.DOTALL,
        )

        # Mark every bare <code> as inline code ...
        html_content = html_content.replace("<code>", '<code class="inline-code">')
        # ... then undo that for code inside <pre> blocks only, so inline
        # code in the prose following a block keeps its class.
        html_content = re.sub(
            r"<pre[^>]*>.*?</pre>",
            lambda m: m.group(0).replace('<code class="inline-code">', "<code>"),
            html_content,
            flags=re.DOTALL,
        )

        # Add Bootstrap link classes
        html_content = re.sub(
            r'<a([^>]*?)href="([^"]*)"([^>]*?)>',
            r'<a\1href="\2"\3 class="text-decoration-none">',
            html_content,
        )

        # Lists, tables and blockquotes: only attribute-less tags are changed
        plain_tag_classes = (
            ("<ul>", '<ul class="list-group list-group-flush">'),
            ("<ol>", '<ol class="list-group list-group-numbered">'),
            ("<li>", '<li class="list-group-item">'),
            ("<table>", '<table class="table table-striped table-hover">'),
            ("<blockquote>", '<blockquote class="alert alert-info">'),
        )
        for plain, decorated in plain_tag_classes:
            html_content = html_content.replace(plain, decorated)

        # Add Bootstrap button classes to links that look like buttons
        html_content = re.sub(
            r'<a([^>]*?)class="[^"]*btn[^"]*"([^>]*?)>',
            r'<a\1class="btn btn-primary"\2>',
            html_content,
        )

        return html_content

    def render_task_list_checkboxes(self, html_content: str) -> str:
        """Render markdown task-list markers as checkbox inputs.

        ``<li>`` items whose content starts with ``[ ]``, ``[x]`` or ``[X]``
        get a disabled checkbox input and the ``task-list-item`` class.

        Args:
            html_content: HTML to process.

        Returns:
            The HTML with task-list markers replaced.
        """

        def add_class(attrs: str, class_name: str) -> str:
            class_match = re.search(r'class="([^"]*)"', attrs)
            if class_match is None:
                return f'{attrs} class="{class_name}"'
            classes = class_match.group(1).split()
            if class_name not in classes:
                classes.append(class_name)
            merged = f'class="{" ".join(classes)}"'
            # A callable replacement inserts the text literally (no
            # backslash/group-reference processing of class names).
            return re.sub(r'class="([^"]*)"', lambda _m: merged, attrs)

        def replace_task_item(match: re.Match) -> str:
            attrs = add_class(match.group("attrs") or "", "task-list-item")
            checked_attr = " checked" if match.group("marker").lower() == "x" else ""
            return (
                f"<li{attrs}>"
                f'<input class="form-check-input me-2" type="checkbox"{checked_attr} disabled>'
                f'{match.group("body")}</li>'
            )

        return re.sub(
            r"<li(?P<attrs>[^>]*)>\s*\[(?P<marker>[ xX])\]\s*(?P<body>.*?)</li>",
            replace_task_item,
            html_content,
            flags=re.DOTALL,
        )

    def extract_title_from_markdown(self, markdown_content: str) -> str:
        """Extract title from markdown content.

        Returns the text of the first ``# `` heading that is not inside a
        fenced code block, or ``"Documentation"`` when there is none.
        """
        in_fence = False
        for line in markdown_content.split("\n"):
            stripped = line.strip()
            if stripped.startswith(("```", "~~~")):
                # "# ..." lines inside fences are code comments, not titles
                in_fence = not in_fence
                continue
            if not in_fence and stripped.startswith("# "):
                return stripped[2:].strip()
        return "Documentation"  # Default fallback title

    def basic_markdown_to_html(self, markdown_content: str) -> str:
        """Basic markdown to HTML conversion - alias for compatibility."""
        return self.markdown_to_html(markdown_content)

    def convert_markdown_links_to_html(
        self, content: str, source_file: str = "", target_dir: str = ""
    ) -> str:
        """Convert markdown links to HTML format.

        Rewrites ``.md`` targets (and extension-less LICENSE / README /
        CHANGELOG / CONTRIBUTING targets) to ``.html`` in both markdown
        ``[text](link)`` and HTML ``href="link"`` forms.

        Args:
            content: Markdown or HTML text containing links.
            source_file: When non-empty, site-build normalizations are also
                applied (absolute ``/docs/`` paths, developers directory
                indexes). When empty, relative paths are preserved.
            target_dir: Output path of the page. When non-empty, absolute
                ``/docs/...`` links are rewritten relative to its directory.

        Returns:
            The text with links rewritten.
        """

        # Convert [text](link.md) to [text](link.html) - markdown style
        def replace_md_links(match):
            link = self._process_link_path(match.group(2), source_file)
            return f"[{match.group(1)}]({link})"

        # Convert href="link.md" to href="link.html" - HTML style
        def replace_href_links(match):
            link = self._process_link_path(match.group(2), source_file)
            return f"{match.group(1)}{link}{match.group(3)}"

        # Well-known repo files that are linked without an extension
        well_known_link_pattern_md = (
            r"\[([^\]]+)\]\(((?:(?:\.\./)+|\./|/)?"
            r"(?:LICENSE|README|CHANGELOG|CONTRIBUTING)(?:/[^)]*)?(?:#[^)]*)?)\)"
        )
        well_known_link_pattern_href = (
            r'(href=")((?:(?:\.\./)+|\./|/)?'
            r'(?:LICENSE|README|CHANGELOG|CONTRIBUTING)(?:/[^"]*)?(?:#[^"]*)?)(")'
        )

        content = re.sub(
            r"\[([^\]]+)\]\(([^)]+\.md(?:#[^)]*)?)\)", replace_md_links, content
        )
        content = re.sub(well_known_link_pattern_md, replace_md_links, content)
        content = re.sub(
            r'(href=")([^"]+\.md(?:#[^"]*)?)(")', replace_href_links, content
        )
        content = re.sub(well_known_link_pattern_href, replace_href_links, content)

        # The following normalizations are only applied during site builds
        # (when source_file is provided); otherwise relative paths are kept.
        # Rules are applied strictly in order.
        if source_file:
            build_rules = (
                # docs/... -> /docs/... (HTML and Markdown)
                (r'(href=")(docs/[^"]+)(")', r"\1/\2\3"),
                (r"\]\((docs/[^)]+)\)", r"](/\1)"),
                # Collapse accidental duplicate docs/docs prefixes
                (r'(href=")/?docs/docs/([^"]+)(")', r"\1/docs/\2\3"),
                (r"\]\(/?docs/docs/([^\)]+)\)", r"](/docs/\1)"),
                # ./docs/... -> /docs/...
                (r'(href=")\./docs/([^"#]*)(#[^"]*)?(")', r"\1/docs/\2\3\4"),
                (r"\]\(\./docs/([^\)#]*)(#[^\)]*)?\)", r"](/docs/\1\2)"),
                # ../../docs/... -> /docs/...
                (r'(href=")(?:\.{2}/)+docs/([^"#]*)(#[^"]*)?(")', r"\1/docs/\2\3\4"),
                (r"\]\((?:\.{2}/)+docs/([^\)#]*)(#[^\)]*)?\)", r"](/docs/\1\2)"),
                # Any remaining .md (with optional anchor) -> .html
                (r'(href=")([^"\s]+)\.md(#[^"]*)?(")', r"\1\2.html\3\4"),
                (r"\]\(([^\)\s]+)\.md(#[^\)]*)?\)", r"](\1.html\2)"),
                # Developers pages become directory indexes: ./x.html -> ./x/
                (
                    r'(href=")\./(architecture|testing|deployment|extending)\.html(")',
                    r"\1./\2/\3",
                ),
                # .../developers/x.html -> .../developers/x/
                (
                    r'(href=")([^"\s]*/developers/)(architecture|testing|deployment|extending)\.html(")',
                    r"\1\2\3/\4",
                ),
                (
                    r"\]\(([^\)\s]*/developers/)(architecture|testing|deployment|extending)\.html\)",
                    r"](\1\2/)",
                ),
                # Same as above but keeping an anchor
                (
                    r'(href=")([^"#]*/developers/)(architecture|testing|deployment|extending)\.html(#[^"]*)?(")',
                    r"\1\2\3/\4\5",
                ),
                # Sibling links such as ../extending.html -> ../extending/
                (
                    r'(href=")\.\./(architecture|testing|deployment|extending)\.html(#[^"]*)?(")',
                    r"\1../\2/\3\4",
                ),
                (
                    r"\]\(\.\./(architecture|testing|deployment|extending)\.html(#[^\)]*)?\)",
                    r"](../\1/\2)",
                ),
                # Well-known repo root files under /docs get an .html extension
                (
                    r'(href=")(/docs/(?:LICENSE|README|CHANGELOG|CONTRIBUTING))(#[^"]*)?(")',
                    r"\1\2.html\3\4",
                ),
            )
            for pattern, replacement in build_rules:
                content = re.sub(pattern, replacement, content)

        # If a target output path is provided, convert absolute /docs/...
        # links to paths relative to the page's directory.
        if target_dir:
            import posixpath

            base_dir = target_dir
            if not base_dir.endswith("/"):
                # target_dir names a file: use its containing directory
                base_dir = posixpath.dirname(base_dir) + "/"
            start = base_dir.rstrip("/")

            def _to_relative_html(match: re.Match) -> str:
                rel = posixpath.relpath("docs/" + match.group(2), start)
                return f'{match.group(1)}{rel}{match.group(3) or ""}{match.group(4)}'

            def _to_relative_md(match: re.Match) -> str:
                rel = posixpath.relpath("docs/" + match.group(1), start)
                return f"]({rel}{match.group(2) or ''})"

            try:
                relative = re.sub(
                    r'(href=")/docs/([^"#]+)(#[^"]*)?(")', _to_relative_html, content
                )
                relative = re.sub(
                    r"\]\(/docs/([^\)#]+)(#[^\)]*)?\)", _to_relative_md, relative
                )
            except (ValueError, OSError):
                # Best effort: relpath could not be computed (e.g. the
                # working directory is gone); keep the absolute links.
                pass
            else:
                content = relative

        return content

    def _process_link_path(self, link: str, source_file: str = "") -> str:
        """Process a link path for conversion.

        Args:
            link: Link target, optionally with a ``#anchor``.
            source_file: When non-empty, ``docs/`` paths are made absolute
                and well-known repo files are pointed at ``/docs/``.

        Returns:
            The rewritten link with its anchor re-attached.
        """
        well_known = ("LICENSE", "README", "CHANGELOG", "CONTRIBUTING")

        # Preserve anchor fragments while processing
        link, hash_sep, fragment = link.partition("#")
        anchor = hash_sep + fragment

        # Only rewrite to absolute /docs when building from a source file context
        if source_file:
            # ../../docs/... -> /docs/...
            link = re.sub(r"^(?:\.{2}/)+docs/", "/docs/", link)
            # ./docs/... -> /docs/...
            link = re.sub(r"^\./docs/", "/docs/", link)
            # docs/... (relative) -> /docs/...
            if link.startswith("docs/"):
                link = "/" + link

        # A bare filename.md#anchor is preserved when there is no source context
        keep_bare_md = bool(anchor) and "/" not in link and not source_file

        if link.endswith(".md") and not keep_bare_md:
            link = link[:-3] + ".html"
        else:
            # Handle well-known files without extensions
            filename = link.split("/")[-1]
            if filename.upper() in well_known:
                # Ensure these resolve under /docs when referenced from packages
                if source_file and not link.startswith("/docs/"):
                    link = "/docs/" + filename
                link += ".html"

        # Collapse accidental duplicate /docs/docs prefixes
        link = re.sub(r"^/docs/docs/", "/docs/", link)
        link = link.replace("docs/docs/", "docs/")

        # Ensure absolute /docs/ links are normalized (only when building)
        if source_file and link.startswith("docs/"):
            link = "/" + link

        return link + anchor

    def render_toc(self, html_content: str) -> str:
        """Generate table of contents from HTML headings.

        Only headings with a non-empty double-quoted ``id`` attribute are
        listed. Nesting follows heading levels via nested ``<ul>`` elements.
        A heading's first ``<img>`` is reused as its icon (with event
        handlers, ``javascript:`` and sizing attributes removed); leaf
        headings without an image get a default SVG icon.

        Args:
            html_content: HTML containing headings.

        Returns:
            The TOC markup, or ``""`` when no heading qualifies.
        """
        import html as _html

        # Find all headings (capture inner HTML, allow multiline)
        heading_pattern = r'<(h[1-6])[^>]*id="([^\"]+)"[^>]*>(.*?)</h[1-6]>'
        headings = re.findall(heading_pattern, html_content, flags=re.DOTALL)

        if not headings:
            return ""

        icon_style = (
            "width:1rem;height:1rem;object-fit:contain;"
            "vertical-align:middle;margin-right:0.35rem;"
        )

        # Default leaf icon (small, neutral color)
        default_svg = (
            f'<svg class="toc-icon" style="{icon_style}" '
            'width="20" height="20" viewBox="0 0 20 20" fill="none" xmlns="http://www.w3.org/2000/svg">'
            '<path d="M10 7.5C9.50555 7.5 9.0222 7.64662 8.61108 7.92133C8.19995 8.19603 7.87952 8.58648 7.6903 9.04329C7.50108 9.50011 7.45157 10.0028 7.54804 10.4877C7.6445 10.9727 7.8826 11.4181 8.23223 11.7678C8.58187 12.1174 9.02732 12.3555 9.51228 12.452C9.99723 12.5484 10.4999 12.4989 10.9567 12.3097C11.4135 12.1205 11.804 11.8 12.0787 11.3889C12.3534 10.9778 12.5 10.4945 12.5 10C12.5 9.33696 12.2366 8.70107 11.7678 8.23223C11.2989 7.76339 10.663 7.5 10 7.5ZM10 11.25C9.75277 11.25 9.5111 11.1767 9.30554 11.0393C9.09998 10.902 8.93976 10.7068 8.84515 10.4784C8.75054 10.2499 8.72579 9.99861 8.77402 9.75614C8.82225 9.51366 8.9413 9.29093 9.11612 9.11612C9.29093 8.9413 9.51366 8.82225 9.75614 8.77402C9.99861 8.72579 10.2499 8.75054 10.4784 8.84515C10.7068 8.93976 10.902 9.09998 11.0393 9.30554C11.1767 9.5111 11.25 9.75277 11.25 10C11.25 10.3315 11.1183 10.6495 10.8839 10.8839C10.6495 11.1183 10.3315 11.25 10 11.25Z" fill="#343330" />'
            "</svg>"
        )

        parts = ['<div class="toc"><h3>Table of Contents</h3>']
        current_level = 0
        open_lists = 0

        for idx, (tag, heading_id, text) in enumerate(headings):
            level = int(tag[1])  # Extract number from h1, h2, etc.

            # A heading has children when the very next heading is deeper
            has_child = (
                idx + 1 < len(headings) and int(headings[idx + 1][0][1]) > level
            )

            # Open nested lists for deeper levels, close them for shallower
            while current_level < level:
                parts.append("<ul>")
                open_lists += 1
                current_level += 1
            while current_level > level:
                parts.append("</ul>")
                open_lists -= 1
                current_level -= 1

            # Extract first <img> if present and sanitize it for TOC display
            icon_html = ""
            img_match = re.search(r"(<img[^>]*>)", text, flags=re.DOTALL)
            if img_match:
                icon_html = img_match.group(1)
                # Remove on* handlers (quoted or unquoted) and javascript:
                icon_html = re.sub(
                    r"\s(on\w+)\s*=\s*(\"[^\"]*\"|'[^']*'|[^\s>]+)",
                    "",
                    icon_html,
                    flags=re.IGNORECASE,
                )
                icon_html = re.sub(
                    r"javascript:\s*", "", icon_html, flags=re.IGNORECASE
                )
                # Remove existing size/style attributes to normalize appearance
                icon_html = re.sub(
                    r"\s(width|height)=\s*(\"[^\"]*\"|'[^']*')", "", icon_html
                )
                icon_html = re.sub(r"\sstyle=\s*(\"[^\"]*\"|'[^']*')", "", icon_html)
                # Add class toc-icon (append if class exists)
                if re.search(r"\sclass=\s*\"[^\"]+\"", icon_html):
                    icon_html = re.sub(
                        r"\sclass=\s*\"([^\"]+)\"",
                        lambda m: f' class="{m.group(1)} toc-icon"',
                        icon_html,
                    )
                elif re.search(r"\sclass=\s*'[^']+'", icon_html):
                    icon_html = re.sub(
                        r"\sclass=\s*'([^']+)'",
                        lambda m: f" class='{m.group(1)} toc-icon'",
                        icon_html,
                    )
                else:
                    # Inject class and inline style before the tag's closer,
                    # keeping a self-closing "/>" at the very end of the tag.
                    closer = " />" if icon_html.rstrip().endswith("/>") else ">"
                    tag_open = re.sub(r"\s*/?>\s*$", "", icon_html)
                    icon_html = (
                        f'{tag_open} class="toc-icon" style="{icon_style}"{closer}'
                    )

            # Derive display text: strip HTML, or use img alt, or fallback to id
            display_text = re.sub(r"<[^>]+>", "", text).strip()
            if not display_text:
                alt_match = re.search(r'<img[^>]*alt=["\']([^"\']+)["\']', text)
                display_text = alt_match.group(1).strip() if alt_match else heading_id

            # Normalize entities so text is escaped exactly once
            display_text = _html.escape(_html.unescape(display_text))

            # If no icon and this is a leaf heading, use the default SVG
            if not icon_html and not has_child:
                icon_html = default_svg

            parts.append(
                f'<li class="list-group-item"><a href="#{heading_id}">{icon_html}{display_text}</a></li>\n'
            )

        # Close all remaining open lists
        parts.append("</ul>" * open_lists)
        parts.append("</div>")

        return "".join(parts)