Coverage for website / builder / markdown.py: 85%
338 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-06-11 09:34 +0000
1"""
2Markdown Processing - Markdown-to-HTML Conversion.
4This module handles markdown processing, HTML conversion,
5and content formatting for the website builder.
6"""
8import re
11class MarkdownProcessor:
12 """Handles markdown processing and HTML conversion."""
14 def markdown_to_html(
15 self, markdown_content: str, source_file: str = "", output_file: str = ""
16 ) -> str:
17 """Convert markdown to HTML with Bootstrap styling."""
18 # Normalize empty/whitespace-only content consistently across code paths
19 if not markdown_content.strip():
20 return ""
21 try:
22 import markdown
24 md = markdown.Markdown(
25 extensions=[
26 # Supports fenced code blocks reliably inside list items.
27 "pymdownx.superfences",
28 "fenced_code",
29 "codehilite",
30 "tables",
31 "toc",
32 "attr_list",
33 "def_list",
34 "footnotes",
35 "md_in_html",
36 "sane_lists",
37 ],
38 extension_configs={
39 "pymdownx.superfences": {
40 "custom_fences": [] # Disable custom fences that might use Pygments
41 },
42 "codehilite": {
43 "css_class": "codehilite",
44 "use_pygments": False, # Use simple highlighting without Pygments
45 "guess_lang": True,
46 },
47 },
48 )
49 html = md.convert(markdown_content)
51 # Fix any remaining malformed code blocks
52 html = self.fix_malformed_code_blocks(html)
54 # Add Bootstrap classes
55 html = self.add_bootstrap_classes(html)
57 # Render GitHub-style task list markers as clickable checkboxes
58 html = self.render_task_list_checkboxes(html)
60 # Ensure heading IDs
61 html = self.ensure_heading_ids(html)
63 return html
65 except ImportError:
66 # Fallback to basic conversion
67 html = self._basic_markdown_to_html_no_regex(markdown_content)
68 # Apply Bootstrap classes to fallback HTML too
69 html = self.add_bootstrap_classes(html)
70 # Render task lists in fallback mode too
71 html = self.render_task_list_checkboxes(html)
72 # Ensure heading IDs
73 html = self.ensure_heading_ids(html)
74 return html
76 def _basic_markdown_to_html_no_regex(self, markdown_content: str) -> str:
77 """Basic markdown to HTML conversion without regex."""
78 content = markdown_content
79 if not content.strip():
80 return ""
82 def transform_inline(text: str) -> str:
83 # Bold (strong) and italics (em)
84 text = re.sub(
85 r"\*\*([^*]+)\*\*", lambda m: f"<strong>{m.group(1)}</strong>", text
86 )
87 text = re.sub(r"\*([^*]+)\*", lambda m: f"<em>{m.group(1)}</em>", text)
88 # Inline code
89 text = re.sub(r"`([^`]+)`", lambda m: f"<code>{m.group(1)}</code>", text)
90 # Links [text](url)
91 text = re.sub(
92 r"\[([^\]]+)\]\(([^)]+)\)",
93 lambda m: f'<a href="{m.group(2)}">{m.group(1)}</a>',
94 text,
95 )
96 return text
98 lines = content.split("\n")
99 html_lines: list[str] = []
100 in_code_block = False
101 in_list = False
103 for line in lines:
104 raw = line.rstrip("\n")
105 stripped = raw.lstrip()
106 if stripped.startswith("```"):
107 if in_code_block:
108 html_lines.append("</code></pre>")
109 in_code_block = False
110 else:
111 # close any open list before starting code block
112 if in_list:
113 html_lines.append("</ul>")
114 in_list = False
115 html_lines.append("<pre><code>")
116 in_code_block = True
117 continue
119 if in_code_block:
120 html_lines.append(raw)
121 continue
123 # Headings
124 if raw.startswith("# "):
125 if in_list:
126 html_lines.append("</ul>")
127 in_list = False
128 html_lines.append(f"<h1>{transform_inline(raw[2:])}</h1>")
129 continue
130 if raw.startswith("## "):
131 if in_list:
132 html_lines.append("</ul>")
133 in_list = False
134 html_lines.append(f"<h2>{transform_inline(raw[3:])}</h2>")
135 continue
136 if raw.startswith("### "):
137 if in_list:
138 html_lines.append("</ul>")
139 in_list = False
140 html_lines.append(f"<h3>{transform_inline(raw[4:])}</h3>")
141 continue
142 if raw.startswith("#### "):
143 if in_list:
144 html_lines.append("</ul>")
145 in_list = False
146 html_lines.append(f"<h4>{transform_inline(raw[5:])}</h4>")
147 continue
148 if raw.startswith("##### "):
149 if in_list:
150 html_lines.append("</ul>")
151 in_list = False
152 html_lines.append(f"<h5>{transform_inline(raw[6:])}</h5>")
153 continue
154 if raw.startswith("###### "):
155 if in_list:
156 html_lines.append("</ul>")
157 in_list = False
158 html_lines.append(f"<h6>{transform_inline(raw[7:])}</h6>")
159 continue
161 # Lists
162 if raw.lstrip().startswith("- "):
163 if not in_list:
164 html_lines.append("<ul>")
165 in_list = True
166 item_text = raw.lstrip()[2:]
167 html_lines.append(f"<li>{transform_inline(item_text)}</li>")
168 continue
169 else:
170 if in_list and raw.strip() == "":
171 html_lines.append("</ul>")
172 in_list = False
174 # Paragraphs
175 if raw.strip():
176 html_lines.append(f"<p>{transform_inline(raw)}</p>")
178 # Close any open list
179 if in_list:
180 html_lines.append("</ul>")
182 # Join and strip extraneous blank lines
183 html = "\n".join([h for h in html_lines if h is not None])
184 # Apply Bootstrap classes and heading IDs
185 return html
187 def fix_malformed_code_blocks(self, html_content: str) -> str:
188 """Fix code blocks that weren't properly converted by markdown."""
190 # Fix single-line code snippets that should be code blocks
191 # Convert paragraphs with inline code containing bash commands to proper code blocks
192 html_content = re.sub(
193 r'<p><code class="inline-code">(bash|sh)\s*\n\s*([^<]+)</code></p>',
194 r'<div class="code-block-wrapper"><pre class="code-block"><code class="language-\1">\2</code></pre></div>',
195 html_content,
196 )
198 # Fix paragraphs with bash/shell commands (with or without language prefix)
199 html_content = re.sub(
200 r'<p><code class="inline-code">(?:bash\s*\n\s*)?([^<]*(?:mkdir|cd|pip|uv|qdrant-loader|mcp-)[^<]*)</code></p>',
201 r'<div class="code-block-wrapper"><pre class="code-block"><code class="language-bash">\1</code></pre></div>',
202 html_content,
203 )
205 # Also handle cases where there's no class attribute
206 html_content = re.sub(
207 r"<p><code>(?:bash\s*\n\s*)?([^<]*(?:mkdir|cd|pip|uv|qdrant-loader|mcp-)[^<]*)</code></p>",
208 r'<div class="code-block-wrapper"><pre class="code-block"><code class="language-bash">\1</code></pre></div>',
209 html_content,
210 )
212 # Clean up stray <p> tags inside code blocks
213 html_content = re.sub(
214 r"(<code[^>]*>.*?)</p>\s*<p>(.*?</code>)",
215 r"\1\n\2",
216 html_content,
217 flags=re.DOTALL,
218 )
220 # Fix paragraphs that contain triple backticks (malformed code blocks)
221 def fix_code_block(match):
222 content = match.group(1)
223 # Extract language if present
224 lines = content.split("\n")
225 first_line = lines[0].strip()
226 if first_line.startswith("```"):
227 language = first_line[3:].strip()
228 code_content = "\n".join(lines[1:])
229 # Remove trailing ``` if present
230 if code_content.endswith("```"):
231 code_content = code_content[:-3].rstrip()
232 return f'<div class="code-block-wrapper"><pre class="code-block"><code class="language-{language}">{code_content}</code></pre></div>'
233 return match.group(0)
235 # Match paragraphs containing code blocks
236 html_content = re.sub(
237 r"<p>(```[^`]*```)</p>", fix_code_block, html_content, flags=re.DOTALL
238 )
240 # Handle multi-paragraph code blocks
241 html_content = re.sub(
242 r"<p>```(\w+)\s*</p>\s*<p>(.*?)</p>\s*<p>```</p>",
243 r'<div class="code-block-wrapper"><pre class="code-block"><code class="language-\1">\2</code></pre></div>',
244 html_content,
245 flags=re.DOTALL,
246 )
248 # Handle code blocks split across multiple paragraphs
249 html_content = re.sub(
250 r"<p>```(\w+)?\s*(.*?)\s*```</p>",
251 lambda m: f'<div class="code-block-wrapper"><pre class="code-block"><code class="language-{m.group(1) or ""}">{m.group(2)}</code></pre></div>',
252 html_content,
253 flags=re.DOTALL,
254 )
256 return html_content
258 def ensure_heading_ids(self, html_content: str) -> str:
259 """Ensure all headings have IDs for anchor links."""
261 def slugify(text: str) -> str:
262 """Convert text to URL-safe slug."""
263 import re
265 slug = re.sub(r"[^\w\s-]", "", text.lower())
266 return re.sub(r"[-\s]+", "-", slug).strip("-")
268 def _extract_text(html: str) -> str:
269 """Return visible text for a piece of HTML (fall back to img alt)."""
270 # Remove tags to get visible text
271 text_only = re.sub(r"<[^>]+>", "", html).strip()
272 if text_only:
273 return text_only
274 # If no visible text, try to get alt from first <img>
275 m = re.search(r'<img[^>]*alt=["\']([^"\']+)["\']', html)
276 if m:
277 return m.group(1).strip()
278 return ""
280 def add_id(match: re.Match) -> str:
281 """Add ID to heading if not present."""
282 tag = match.group(1)
283 attrs = match.group(2) or ""
284 content = match.group(3) or ""
286 if "id=" not in attrs:
287 visible = _extract_text(content)
288 heading_id = slugify(visible or content)
289 if attrs:
290 attrs = f' id="{heading_id}" {attrs.strip()}'
291 else:
292 attrs = f' id="{heading_id}"'
294 return f"<{tag}{attrs}>{content}</{tag}>"
296 # Match headings even when they contain HTML inside
297 heading_pattern = r"<(h[1-6])([^>]*)>(.*?)</h[1-6]>"
298 return re.sub(heading_pattern, add_id, html_content, flags=re.DOTALL)
300 def add_bootstrap_classes(self, html_content: str) -> str:
301 """Add Bootstrap classes to HTML elements."""
303 # Add Bootstrap header classes
304 html_content = re.sub(
305 r"<h1([^>]*)>",
306 r'<h1\1 class="display-4 fw-bold text-primary mb-4">',
307 html_content,
308 )
309 html_content = re.sub(
310 r"<h2([^>]*)>",
311 r'<h2\1 class="h2 fw-bold text-primary">',
312 html_content,
313 )
314 html_content = re.sub(
315 r"<h3([^>]*)>",
316 r'<h3\1 class="h3 fw-bold text-primary">',
317 html_content,
318 )
319 html_content = re.sub(
320 r"<h4([^>]*)>", r'<h4\1 class="h4 fw-bold">', html_content
321 )
322 html_content = re.sub(
323 r"<h5([^>]*)>", r'<h5\1 class="h5 fw-bold">', html_content
324 )
325 html_content = re.sub(
326 r"<h6([^>]*)>", r'<h6\1 class="h6 fw-semibold">', html_content
327 )
329 # Add Bootstrap code block classes - clean approach
330 # First handle codehilite divs
331 html_content = re.sub(
332 r'<div class="codehilite">',
333 '<div class="code-block-wrapper">',
334 html_content,
335 )
337 # Handle standalone pre blocks (not already in wrappers)
338 html_content = re.sub(
339 r'(?<!<div class="code-block-wrapper">)<pre>',
340 '<div class="code-block-wrapper"><pre class="code-block">',
341 html_content,
342 )
344 # Add code-block class to pre tags that don't have it
345 html_content = re.sub(
346 r'<pre(?![^>]*class="code-block")([^>]*)>',
347 r'<pre class="code-block"\1>',
348 html_content,
349 )
351 # Close wrapper divs only for pre blocks that we wrapped
352 html_content = re.sub(
353 r'(<div class="code-block-wrapper"><pre class="code-block"[^>]*>.*?)</pre>(?!</div>)',
354 r"\1</pre></div>",
355 html_content,
356 flags=re.DOTALL,
357 )
358 # Add Bootstrap inline code classes
359 # First handle code blocks, then inline code
360 html_content = re.sub(
361 r"<code>",
362 '<code class="inline-code">',
363 html_content,
364 )
365 # Override inline-code class for code inside pre blocks
366 html_content = re.sub(
367 r'(<pre[^>]*>.*?)<code class="inline-code">',
368 r"\1<code>",
369 html_content,
370 flags=re.DOTALL,
371 )
373 # Add Bootstrap link classes
374 html_content = re.sub(
375 r'<a([^>]*?)href="([^"]*)"([^>]*?)>',
376 r'<a\1href="\2"\3 class="text-decoration-none">',
377 html_content,
378 )
380 # Add Bootstrap list classes
381 html_content = re.sub(
382 r"<ul>", '<ul class="list-group list-group-flush">', html_content
383 )
384 html_content = re.sub(
385 r"<ol>", '<ol class="list-group list-group-numbered">', html_content
386 )
387 html_content = re.sub(r"<li>", '<li class="list-group-item">', html_content)
389 # Add Bootstrap table classes
390 html_content = re.sub(
391 r"<table>", '<table class="table table-striped table-hover">', html_content
392 )
394 # Add Bootstrap alert classes for blockquotes
395 html_content = re.sub(
396 r"<blockquote>", '<blockquote class="alert alert-info">', html_content
397 )
399 # Add Bootstrap button classes to links that look like buttons
400 html_content = re.sub(
401 r'<a([^>]*?)class="[^"]*btn[^"]*"([^>]*?)>',
402 r'<a\1class="btn btn-primary"\2>',
403 html_content,
404 )
406 return html_content
408 def render_task_list_checkboxes(self, html_content: str) -> str:
409 """Render markdown task-list markers as checkbox inputs."""
411 def add_class(attrs: str, class_name: str) -> str:
412 class_match = re.search(r'class="([^"]*)"', attrs)
413 if class_match:
414 classes = class_match.group(1).split()
415 if class_name not in classes:
416 classes.append(class_name)
417 return re.sub(r'class="([^"]*)"', f'class="{" ".join(classes)}"', attrs)
418 return f'{attrs} class="{class_name}"'
420 def replace_task_item(match: re.Match) -> str:
421 attrs = match.group("attrs") or ""
422 marker = match.group("marker")
423 body = match.group("body")
424 checked_attr = " checked" if marker.lower() == "x" else ""
425 attrs = add_class(attrs, "task-list-item")
426 return (
427 f"<li{attrs}>"
428 f'<input class="form-check-input me-2" type="checkbox"{checked_attr} disabled>'
429 f"{body}</li>"
430 )
432 return re.sub(
433 r"<li(?P<attrs>[^>]*)>\s*\[(?P<marker>[ xX])\]\s*(?P<body>.*?)</li>",
434 replace_task_item,
435 html_content,
436 flags=re.DOTALL,
437 )
439 def extract_title_from_markdown(self, markdown_content: str) -> str:
440 """Extract title from markdown content."""
441 lines = markdown_content.split("\n")
442 for line in lines:
443 line = line.strip()
444 if line.startswith("# "):
445 return line[2:].strip()
446 return "Documentation" # Default fallback title
448 def basic_markdown_to_html(self, markdown_content: str) -> str:
449 """Basic markdown to HTML conversion - alias for compatibility."""
450 return self.markdown_to_html(markdown_content)
452 def convert_markdown_links_to_html(
453 self, content: str, source_file: str = "", target_dir: str = ""
454 ) -> str:
455 """Convert markdown links to HTML format."""
457 # Convert [text](link.md) to [text](link.html) - markdown style
458 def replace_md_links(match):
459 text = match.group(1)
460 link = match.group(2)
461 link = self._process_link_path(link, source_file)
462 return f"[{text}]({link})"
464 # Convert href="link.md" to href="link.html" - HTML style
465 def replace_href_links(match):
466 prefix = match.group(1)
467 link = match.group(2)
468 suffix = match.group(3)
469 link = self._process_link_path(link, source_file)
470 return f"{prefix}{link}{suffix}"
472 # Apply conversions - expanded patterns to catch more file types
473 # Catch .md files and well-known files without extensions
474 well_known_link_pattern_md = (
475 r"\[([^\]]+)\]\(((?:(?:\.\./)+|\./|/)?"
476 r"(?:LICENSE|README|CHANGELOG|CONTRIBUTING)(?:/[^)]*)?(?:#[^)]*)?)\)"
477 )
478 well_known_link_pattern_href = (
479 r'(href=")((?:(?:\.\./)+|\./|/)?'
480 r'(?:LICENSE|README|CHANGELOG|CONTRIBUTING)(?:/[^"]*)?(?:#[^"]*)?)(")'
481 )
483 content = re.sub(
484 r"\[([^\]]+)\]\(([^)]+\.md(?:#[^)]*)?)\)", replace_md_links, content
485 )
486 content = re.sub(
487 well_known_link_pattern_md,
488 replace_md_links,
489 content,
490 )
491 content = re.sub(
492 r'(href=")([^"]+\.md(?:#[^"]*)?)(")', replace_href_links, content
493 )
494 content = re.sub(
495 well_known_link_pattern_href,
496 replace_href_links,
497 content,
498 )
500 # The following normalizations are only applied during site builds (when source_file is provided).
501 # Unit tests expect relative paths to be preserved.
502 if source_file:
503 # Normalize links that incorrectly include an extra "/docs/" prefix inside /docs pages
504 # e.g., href="docs/users/..." when already under /docs/ -> make it absolute "/docs/users/..."
505 content = re.sub(r'(href=")(docs/[^"]+)(")', r"\1/\2\3", content)
506 content = re.sub(r"\]\((docs/[^)]+)\)", r"](/\1)", content)
508 # Collapse accidental duplicate docs/docs prefixes
509 content = re.sub(
510 r'(href=")/?docs/docs/([^"]+)(")', r"\1/docs/\2\3", content
511 )
512 content = re.sub(r"\]\(/?docs/docs/([^\)]+)\)", r"](/docs/\1)", content)
514 # Rewrite relative ./docs/... links to absolute /docs/ (HTML and Markdown)
515 content = re.sub(
516 r'(href=")\./docs/([^"#]*)(#[^"]*)?(")', r"\1/docs/\2\3\4", content
517 )
518 content = re.sub(
519 r"\]\(\./docs/([^\)#]*)(#[^\)]*)?\)", r"](/docs/\1\2)", content
520 )
522 # Rewrite relative ../../docs/... links to absolute /docs/ (HTML and Markdown)
523 content = re.sub(
524 r'(href=")(?:\.{2}/)+docs/([^"#]*)(#[^"]*)?(")',
525 r"\1/docs/\2\3\4",
526 content,
527 )
528 content = re.sub(
529 r"\]\((?:\.{2}/)+docs/([^\)#]*)(#[^\)]*)?\)", r"](/docs/\1\2)", content
530 )
532 # Convert .md (with optional anchors) to .html in both HTML and Markdown links
533 content = re.sub(
534 r'(href=")([^"\s]+)\.md(#[^"]*)?(")',
535 lambda m: f"{m.group(1)}{m.group(2)}.html{m.group(3) or ''}{m.group(4)}",
536 content,
537 )
538 content = re.sub(
539 r"\]\(([^\)\s]+)\.md(#[^\)]*)?\)",
540 lambda m: f"]({m.group(1)}.html{m.group(2) or ''})",
541 content,
542 )
544 # Normalize developers relative links to directory indexes
545 content = re.sub(
546 r'(href=")\./(architecture|testing|deployment|extending)\.html(")',
547 r"\1./\2/\3",
548 content,
549 )
550 # Normalize absolute developers/*.html to directory indexes
551 content = re.sub(
552 r'(href=")([^"\s]*/developers/)(architecture|testing|deployment|extending)\.html(")',
553 r"\1\2\3/\4",
554 content,
555 )
556 content = re.sub(
557 r"\]\(([^\)\s]*/developers/)(architecture|testing|deployment|extending)\.html\)",
558 r"](\1\2/)",
559 content,
560 )
561 # Normalize parent-relative developers links like ../extending.html to ../extending/
562 content = re.sub(
563 r'(href=")([^"#]*/developers/)(architecture|testing|deployment|extending)\.html(#[^"]*)?(")',
564 r"\1\2\3/\4\5",
565 content,
566 )
567 # Normalize sibling links such as ../extending.html -> ../extending/
568 content = re.sub(
569 r'(href=")\.\./(architecture|testing|deployment|extending)\.html(#[^"]*)?(")',
570 r"\1../\2/\3\4",
571 content,
572 )
573 content = re.sub(
574 r"\]\(\.\./(architecture|testing|deployment|extending)\.html(#[^\)]*)?\)",
575 r"](../\1/\2)",
576 content,
577 )
579 # Ensure well-known repo root files under /docs have .html extension
580 content = re.sub(
581 r'(href=")(/docs/(?:LICENSE|README|CHANGELOG|CONTRIBUTING))(#[^"]*)?(")',
582 r"\1\2.html\3\4",
583 content,
584 )
586 # If a target output path is provided, convert absolute /docs/... links to relative ones
587 if target_dir:
588 try:
589 import posixpath
591 base_dir = target_dir
592 if not base_dir.endswith("/"):
593 base_dir = posixpath.dirname(base_dir) + "/"
595 def _to_relative_html(match: re.Match) -> str:
596 prefix, path_part, anchor, suffix = (
597 match.group(1),
598 match.group(2),
599 match.group(3) or "",
600 match.group(4),
601 )
602 abs_path = "docs/" + path_part
603 rel = posixpath.relpath(abs_path, base_dir.rstrip("/"))
604 return f'{prefix}{rel}{anchor or ""}{suffix}'
606 def _to_relative_md(match: re.Match) -> str:
607 path_part, anchor = match.group(1), match.group(2) or ""
608 abs_path = "docs/" + path_part
609 rel = posixpath.relpath(abs_path, base_dir.rstrip("/"))
610 return f"]({rel}{anchor})"
612 content = re.sub(
613 r'(href=")/docs/([^"#]+)(#[^"]*)?(")',
614 _to_relative_html,
615 content,
616 )
617 content = re.sub(
618 r"\]\(/docs/([^\)#]+)(#[^\)]*)?\)", _to_relative_md, content
619 )
620 except Exception:
621 # Fallback silently if relative conversion fails
622 pass
624 return content
626 def _process_link_path(self, link: str, source_file: str = "") -> str:
627 """Process a link path for conversion."""
628 # Preserve anchor fragments while processing
629 anchor = ""
630 if "#" in link:
631 link, anchor = link.split("#", 1)
632 anchor = "#" + anchor
634 # Only rewrite to absolute /docs when building from a source file context
635 if source_file:
636 # ../../docs/... -> /docs/...
637 link = re.sub(r"^(?:\.{2}/)+docs/", "/docs/", link)
638 # ./docs/... -> /docs/...
639 link = re.sub(r"^\./docs/", "/docs/", link)
640 # docs/... (relative) -> /docs/...
641 if link.startswith("docs/"):
642 link = "/" + link
644 # Decide whether to convert .md to .html (preserving anchors)
645 should_convert_md = True
646 if anchor and "/" not in link and not source_file:
647 # Preserve bare filename.md#anchor in tests (no source context)
648 should_convert_md = False
650 if link.endswith(".md") and should_convert_md:
651 link = link[:-3] + ".html"
652 else:
653 # Handle well-known files without extensions
654 filename = link.split("/")[-1]
655 if (
656 filename.upper() in ["LICENSE", "README", "CHANGELOG", "CONTRIBUTING"]
657 and "." not in filename
658 ):
659 # Ensure these resolve under /docs when referenced from packages
660 if (
661 source_file
662 and not link.startswith("/docs/")
663 and filename.upper()
664 in ["LICENSE", "README", "CHANGELOG", "CONTRIBUTING"]
665 ):
666 # Nudge to /docs root for repo-wide files
667 link = "/docs/" + filename
668 link = link + ".html"
670 # Collapse accidental duplicate /docs/docs prefixes
671 link = re.sub(r"^/docs/docs/", "/docs/", link)
672 link = link.replace("docs/docs/", "docs/")
674 # Ensure absolute /docs/ links are normalized (only when building)
675 if source_file and link.startswith("docs/"):
676 link = "/" + link
678 return link + anchor
680 def render_toc(self, html_content: str) -> str:
681 """Generate table of contents from HTML headings."""
682 # Find all headings (capture inner HTML, allow multiline)
683 heading_pattern = r'<(h[1-6])[^>]*id="([^\"]+)"[^>]*>(.*?)</h[1-6]>'
684 headings = re.findall(heading_pattern, html_content, flags=re.DOTALL)
686 if not headings:
687 return ""
689 toc_html = '<div class="toc"><h3>Table of Contents</h3>'
691 # Build hierarchical structure
692 current_level = 0
693 open_lists = 0
695 import html as _html
697 # Default leaf icon (small, neutral color)
698 default_svg = (
699 '<svg class="toc-icon" style="width:1rem;height:1rem;object-fit:contain;vertical-align:middle;margin-right:0.35rem;" '
700 'width="20" height="20" viewBox="0 0 20 20" fill="none" xmlns="http://www.w3.org/2000/svg">'
701 '<path d="M10 7.5C9.50555 7.5 9.0222 7.64662 8.61108 7.92133C8.19995 8.19603 7.87952 8.58648 7.6903 9.04329C7.50108 9.50011 7.45157 10.0028 7.54804 10.4877C7.6445 10.9727 7.8826 11.4181 8.23223 11.7678C8.58187 12.1174 9.02732 12.3555 9.51228 12.452C9.99723 12.5484 10.4999 12.4989 10.9567 12.3097C11.4135 12.1205 11.804 11.8 12.0787 11.3889C12.3534 10.9778 12.5 10.4945 12.5 10C12.5 9.33696 12.2366 8.70107 11.7678 8.23223C11.2989 7.76339 10.663 7.5 10 7.5ZM10 11.25C9.75277 11.25 9.5111 11.1767 9.30554 11.0393C9.09998 10.902 8.93976 10.7068 8.84515 10.4784C8.75054 10.2499 8.72579 9.99861 8.77402 9.75614C8.82225 9.51366 8.9413 9.29093 9.11612 9.11612C9.29093 8.9413 9.51366 8.82225 9.75614 8.77402C9.99861 8.72579 10.2499 8.75054 10.4784 8.84515C10.7068 8.93976 10.902 9.09998 11.0393 9.30554C11.1767 9.5111 11.25 9.75277 11.25 10C11.25 10.3315 11.1183 10.6495 10.8839 10.8839C10.6495 11.1183 10.3315 11.25 10 11.25Z" fill="#343330" />'
702 "</svg>"
703 )
705 for idx, (tag, heading_id, text) in enumerate(headings):
706 level = int(tag[1]) # Extract number from h1, h2, etc.
708 # Determine if this heading has child headings (deeper level) before next sibling
709 has_child = False
710 for next_tag, _next_id, _next_text in headings[idx + 1 :]:
711 next_level = int(next_tag[1])
712 if next_level > level:
713 has_child = True
714 break
715 if next_level <= level:
716 break
718 # Handle level changes
719 if level > current_level:
720 # Open new nested lists for deeper levels
721 while current_level < level:
722 if current_level == 0:
723 toc_html += "<ul>"
724 else:
725 toc_html += "<ul>"
726 open_lists += 1
727 current_level += 1
728 elif level < current_level:
729 # Close lists for shallower levels
730 while current_level > level:
731 toc_html += "</ul>"
732 open_lists -= 1
733 current_level -= 1
735 # Extract first <img> if present and sanitize it for TOC display
736 icon_html = ""
737 img_match = re.search(r"(<img[^>]*>)", text, flags=re.DOTALL)
738 if img_match:
739 icon_html = img_match.group(1)
740 # Remove any on* handlers and javascript: hrefs for safety
741 icon_html = re.sub(
742 r"\s(on\w+)\s*=\s*(\"[^\"]*\"|'[^']*')", "", icon_html
743 )
744 icon_html = re.sub(
745 r"javascript:\s*", "", icon_html, flags=re.IGNORECASE
746 )
747 # Remove any existing size/style attributes so we can normalize appearance
748 icon_html = re.sub(
749 r"\s(width|height)=\s*(\"[^\"]*\"|'[^']*')", "", icon_html
750 )
751 icon_html = re.sub(r"\sstyle=\s*(\"[^\"]*\"|'[^']*')", "", icon_html)
752 # Ensure a small consistent size and spacing for TOC icons
753 # Add class toc-icon (append if class exists)
754 if re.search(r"\sclass=\s*\"[^\"]+\"", icon_html):
755 icon_html = re.sub(
756 r"\sclass=\s*\"([^\"]+)\"",
757 lambda m: f' class="{m.group(1)} toc-icon"',
758 icon_html,
759 )
760 elif re.search(r"\sclass=\s*'[^']+'", icon_html):
761 icon_html = re.sub(
762 r"\sclass=\s*'([^']+)'",
763 lambda m: f" class='{m.group(1)} toc-icon'",
764 icon_html,
765 )
766 else:
767 # inject class and inline style before the closing >
768 icon_html = (
769 icon_html.rstrip(">")
770 + ' class="toc-icon" style="width:1rem;height:1rem;object-fit:contain;vertical-align:middle;margin-right:0.35rem;">'
771 )
773 # Derive display text: strip HTML, or use img alt, or fallback to id
774 display_text = re.sub(r"<[^>]+>", "", text).strip()
775 if not display_text:
776 m = re.search(r'<img[^>]*alt=["\']([^"\']+)["\']', text)
777 if m:
778 display_text = m.group(1).strip()
779 else:
780 display_text = heading_id
782 display_text = _html.escape(_html.unescape(display_text))
784 # If no icon and this is a level-3 leaf heading (and not already a numbered item), use the default SVG
785 starts_with_number = bool(re.match(r"^\d+\.", display_text))
786 if (
787 not icon_html
788 and not has_child
789 and level == 3
790 and not starts_with_number
791 ):
792 icon_html = default_svg
794 toc_html += f'<li class="list-group-item"><a href="#{heading_id}">{icon_html}{display_text}</a></li>\n'
795 # Close all remaining open lists
796 while open_lists > 0:
797 toc_html += "</ul>"
798 open_lists -= 1
800 toc_html += "</div>"
802 return toc_html