Coverage for website / builder / markdown.py: 85%

336 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-10 09:37 +0000

1""" 

2Markdown Processing - Markdown-to-HTML Conversion. 

3 

4This module handles markdown processing, HTML conversion, 

5and content formatting for the website builder. 

6""" 

7 

8import re 

9 

10 

class MarkdownProcessor:
    """Convert markdown to Bootstrap-styled HTML and rewrite documentation links.

    The processor prefers the third-party ``markdown`` package and falls back
    to a small built-in converter when that package cannot be imported.  All
    post-processing (Bootstrap classes, task-list checkboxes, heading IDs,
    table of contents) is done with regular expressions on the HTML string.
    """

    @staticmethod
    def _merge_class_attr(attrs: str, class_names: str) -> str:
        """Return *attrs* with *class_names* added to its ``class`` attribute.

        Args:
            attrs: Raw attribute text of a tag (everything between the tag
                name and the closing ``>``); may be empty.
            class_names: Space-separated class names to add.

        Returns:
            The attribute text with the names merged into an existing
            double-quoted ``class="..."`` (no duplicates), or with a new
            `` class="..."`` appended when the tag has none.
        """
        # The lookbehind keeps attributes such as data-class="..." from matching.
        existing = re.search(r'(?<![\w-])class="([^"]*)"', attrs)
        if existing is None:
            return f'{attrs} class="{class_names}"'

        merged = existing.group(1).split()
        for name in class_names.split():
            if name not in merged:
                merged.append(name)
        joined = " ".join(merged)
        return f'{attrs[:existing.start()]}class="{joined}"{attrs[existing.end():]}'

    def markdown_to_html(
        self, markdown_content: str, source_file: str = "", output_file: str = ""
    ) -> str:
        """Convert markdown to HTML with Bootstrap styling.

        Args:
            markdown_content: Markdown source text.
            source_file: Accepted for interface compatibility; not used here.
            output_file: Accepted for interface compatibility; not used here.

        Returns:
            The styled HTML, or ``""`` for empty/whitespace-only (or ``None``)
            input.
        """
        # Normalize empty/whitespace-only content consistently across code paths
        if not markdown_content or not markdown_content.strip():
            return ""

        try:
            import markdown

            md = markdown.Markdown(
                extensions=[
                    "fenced_code",
                    "codehilite",
                    "tables",
                    "toc",
                    "attr_list",
                    "def_list",
                    "footnotes",
                    "md_in_html",
                    "sane_lists",
                ],
                extension_configs={
                    "codehilite": {
                        "css_class": "codehilite",
                        "use_pygments": False,  # Use simple highlighting without Pygments
                        "guess_lang": True,
                    }
                },
            )
            html = md.convert(markdown_content)

            # Fix any remaining malformed code blocks
            html = self.fix_malformed_code_blocks(html)
        except ImportError:
            # Fallback to basic conversion when ``markdown`` (or one of its
            # extensions) cannot be imported.
            html = self._basic_markdown_to_html_no_regex(markdown_content)

        # Post-processing shared by both conversion paths.
        html = self.add_bootstrap_classes(html)
        html = self.render_task_list_checkboxes(html)
        return self.ensure_heading_ids(html)

    def _basic_markdown_to_html_no_regex(self, markdown_content: str) -> str:
        """Convert markdown to HTML without the ``markdown`` package.

        Supports ATX headings (``#`` to ``######``), ``- `` bullet lists,
        triple-backtick fenced code blocks, paragraphs, and inline bold,
        italics, code and links.  (Despite the historical name, inline markup
        is handled with regular expressions.)

        Args:
            markdown_content: Markdown source text.

        Returns:
            HTML lines joined with ``"\\n"``; ``""`` for empty input.
        """
        import html as _html

        if not markdown_content or not markdown_content.strip():
            return ""

        def transform_inline(text: str) -> str:
            # Bold (strong) must run before italics (em) so ** is not eaten by *.
            text = re.sub(r"\*\*([^*]+)\*\*", r"<strong>\1</strong>", text)
            text = re.sub(r"\*([^*]+)\*", r"<em>\1</em>", text)
            # Inline code
            text = re.sub(r"`([^`]+)`", r"<code>\1</code>", text)
            # Links [text](url)
            text = re.sub(r"\[([^\]]+)\]\(([^)]+)\)", r'<a href="\2">\1</a>', text)
            return text

        html_lines: list[str] = []
        in_code_block = False
        in_list = False

        for raw in markdown_content.split("\n"):
            if raw.startswith("```"):
                if in_code_block:
                    html_lines.append("</code></pre>")
                else:
                    # close any open list before starting code block
                    if in_list:
                        html_lines.append("</ul>")
                        in_list = False
                    html_lines.append("<pre><code>")
                in_code_block = not in_code_block
                continue

            if in_code_block:
                # Code is literal text: escape it so "<" and "&" cannot be
                # parsed as markup by the browser or by later regex passes.
                html_lines.append(_html.escape(raw, quote=False))
                continue

            is_list_item = raw.lstrip().startswith("- ")

            # Any line that is not a list item ends the current list, so
            # headings and paragraphs are never emitted inside <ul>.
            if in_list and not is_list_item:
                html_lines.append("</ul>")
                in_list = False

            if not raw.strip():
                continue

            # Headings: one to six "#" at column 0 followed by a space.
            heading = re.match(r"(#{1,6}) (.*)", raw)
            if heading:
                level = len(heading.group(1))
                html_lines.append(
                    f"<h{level}>{transform_inline(heading.group(2))}</h{level}>"
                )
                continue

            # Lists
            if is_list_item:
                if not in_list:
                    html_lines.append("<ul>")
                    in_list = True
                html_lines.append(f"<li>{transform_inline(raw.lstrip()[2:])}</li>")
                continue

            # Paragraphs
            html_lines.append(f"<p>{transform_inline(raw)}</p>")

        # Close anything still open at end of input
        if in_code_block:
            html_lines.append("</code></pre>")
        if in_list:
            html_lines.append("</ul>")

        return "\n".join(html_lines)

    def fix_malformed_code_blocks(self, html_content: str) -> str:
        """Fix code blocks that weren't properly converted by markdown.

        Applies a fixed sequence of regex rewrites; each step works on the
        output of the previous one, so the order must not be changed.

        Args:
            html_content: HTML produced by the markdown converter.

        Returns:
            HTML where recognised malformed code paragraphs are replaced by
            ``<div class="code-block-wrapper"><pre class="code-block">``
            blocks.
        """
        bash_block = (
            r'<div class="code-block-wrapper"><pre class="code-block">'
            r'<code class="language-bash">\1</code></pre></div>'
        )

        # Fix single-line code snippets that should be code blocks
        # Convert paragraphs with inline code containing bash commands to proper code blocks
        html_content = re.sub(
            r'<p><code class="inline-code">(bash|sh)\s*\n\s*([^<]+)</code></p>',
            r'<div class="code-block-wrapper"><pre class="code-block"><code class="language-\1">\2</code></pre></div>',
            html_content,
        )

        # Fix paragraphs with bash/shell commands (with or without language prefix)
        html_content = re.sub(
            r'<p><code class="inline-code">(?:bash\s*\n\s*)?([^<]*(?:mkdir|cd|pip|qdrant-loader|mcp-)[^<]*)</code></p>',
            bash_block,
            html_content,
        )

        # Also handle cases where there's no class attribute
        html_content = re.sub(
            r"<p><code>(?:bash\s*\n\s*)?([^<]*(?:mkdir|cd|pip|qdrant-loader|mcp-)[^<]*)</code></p>",
            bash_block,
            html_content,
        )

        # Clean up stray <p> tags inside code blocks
        html_content = re.sub(
            r"(<code[^>]*>.*?)</p>\s*<p>(.*?</code>)",
            r"\1\n\2",
            html_content,
            flags=re.DOTALL,
        )

        # Fix paragraphs that contain triple backticks (malformed code blocks)
        def fix_code_block(match):
            lines = match.group(1).split("\n")
            first_line = lines[0].strip()
            if not first_line.startswith("```"):
                return match.group(0)
            # Extract language if present
            language = first_line[3:].strip()
            code_content = "\n".join(lines[1:])
            # Remove trailing ``` if present
            if code_content.endswith("```"):
                code_content = code_content[:-3].rstrip()
            return f'<div class="code-block-wrapper"><pre class="code-block"><code class="language-{language}">{code_content}</code></pre></div>'

        # Match paragraphs containing code blocks
        html_content = re.sub(
            r"<p>(```[^`]*```)</p>", fix_code_block, html_content, flags=re.DOTALL
        )

        # Handle multi-paragraph code blocks
        html_content = re.sub(
            r"<p>```(\w+)\s*</p>\s*<p>(.*?)</p>\s*<p>```</p>",
            r'<div class="code-block-wrapper"><pre class="code-block"><code class="language-\1">\2</code></pre></div>',
            html_content,
            flags=re.DOTALL,
        )

        # Handle code blocks split across multiple paragraphs
        html_content = re.sub(
            r"<p>```(\w+)?\s*(.*?)\s*```</p>",
            lambda m: f'<div class="code-block-wrapper"><pre class="code-block"><code class="language-{m.group(1) or ""}">{m.group(2)}</code></pre></div>',
            html_content,
            flags=re.DOTALL,
        )

        return html_content

    def ensure_heading_ids(self, html_content: str) -> str:
        """Ensure all headings have IDs for anchor links.

        Headings that already carry an ``id`` attribute are left untouched.
        For the others an ID is derived from the visible text (or the ``alt``
        of the first image).  When a derived ID is already in use in the
        document, a numeric suffix (``-1``, ``-2``, ...) keeps it unique.

        Args:
            html_content: HTML to process.

        Returns:
            The HTML with an ``id`` on every ``<h1>``-``<h6>``.
        """
        # IDs already present anywhere in the document; generated IDs must
        # not collide with them or with each other.
        used_ids = set(re.findall(r'\bid="([^"]*)"', html_content))

        def slugify(text: str) -> str:
            """Convert text to URL-safe slug."""
            slug = re.sub(r"[^\w\s-]", "", text.lower())
            return re.sub(r"[-\s]+", "-", slug).strip("-")

        def _extract_text(fragment: str) -> str:
            """Return visible text for a piece of HTML (fall back to img alt)."""
            text_only = re.sub(r"<[^>]+>", "", fragment).strip()
            if text_only:
                return text_only
            # If no visible text, try to get alt from first <img>
            alt = re.search(r'<img[^>]*alt=["\']([^"\']+)["\']', fragment)
            return alt.group(1).strip() if alt else ""

        def _unique(base: str) -> str:
            """Reserve and return *base*, suffixed if it is already taken."""
            candidate = base
            if base:  # an empty slug is passed through unchanged
                counter = 0
                while candidate in used_ids:
                    counter += 1
                    candidate = f"{base}-{counter}"
                used_ids.add(candidate)
            return candidate

        def add_id(match: re.Match) -> str:
            """Add ID to heading if not present."""
            tag = match.group(1)
            attrs = match.group(2) or ""
            content = match.group(3) or ""

            # Require a real ``id`` attribute; ``data-id=`` must not count.
            if not re.search(r"(?:^|\s)id\s*=", attrs):
                visible = _extract_text(content)
                heading_id = _unique(slugify(visible or content))
                if attrs:
                    attrs = f' id="{heading_id}" {attrs.strip()}'
                else:
                    attrs = f' id="{heading_id}"'

            return f"<{tag}{attrs}>{content}</{tag}>"

        # Match headings even when they contain HTML inside
        heading_pattern = r"<(h[1-6])([^>]*)>(.*?)</h[1-6]>"
        return re.sub(heading_pattern, add_id, html_content, flags=re.DOTALL)

    def add_bootstrap_classes(self, html_content: str) -> str:
        """Add Bootstrap classes to HTML elements.

        Styles headings, code blocks, inline code, links, lists, tables and
        blockquotes.  Headings and links that already have a ``class``
        attribute get the Bootstrap classes merged into it instead of a
        second ``class`` attribute.

        Args:
            html_content: HTML to decorate.

        Returns:
            The decorated HTML.
        """
        # Add Bootstrap header classes
        heading_classes = {
            "h1": "display-4 fw-bold text-primary mb-4",
            "h2": "h2 fw-bold text-primary mt-5 mb-3",
            "h3": "h3 fw-bold text-primary mt-5 mb-3",
            "h4": "h4 fw-bold mt-4 mb-3",
            "h5": "h5 fw-bold mt-3 mb-2",
            "h6": "h6 fw-semibold mt-2 mb-1",
        }

        def style_heading(match: re.Match) -> str:
            tag = match.group(1)
            attrs = self._merge_class_attr(match.group(2), heading_classes[tag])
            return f"<{tag}{attrs}>"

        html_content = re.sub(r"<(h[1-6])([^>]*)>", style_heading, html_content)

        # Add Bootstrap code block classes - clean approach
        # First handle codehilite divs
        html_content = re.sub(
            r'<div class="codehilite">',
            '<div class="code-block-wrapper">',
            html_content,
        )

        # Handle standalone pre blocks (not already in wrappers)
        html_content = re.sub(
            r'(?<!<div class="code-block-wrapper">)<pre>',
            '<div class="code-block-wrapper"><pre class="code-block">',
            html_content,
        )

        # Add code-block class to pre tags that don't have it
        html_content = re.sub(
            r'<pre(?![^>]*class="code-block")([^>]*)>',
            r'<pre class="code-block"\1>',
            html_content,
        )

        # Close wrapper divs only for pre blocks that we wrapped
        html_content = re.sub(
            r'(<div class="code-block-wrapper"><pre class="code-block"[^>]*>.*?)</pre>(?!</div>)',
            r"\1</pre></div>",
            html_content,
            flags=re.DOTALL,
        )

        # Add Bootstrap inline code classes, but only OUTSIDE <pre> blocks.
        # re.split with one capture group puts the <pre>...</pre> blocks at
        # the odd indices; those are passed through untouched.
        segments = re.split(r"(<pre\b[^>]*>.*?</pre>)", html_content, flags=re.DOTALL)
        html_content = "".join(
            segment
            if index % 2
            else segment.replace("<code>", '<code class="inline-code">')
            for index, segment in enumerate(segments)
        )

        # Add Bootstrap link classes
        def style_link(match: re.Match) -> str:
            before, href, after = match.groups()
            attrs = f'{before}href="{href}"{after}'
            return f"<a{self._merge_class_attr(attrs, 'text-decoration-none')}>"

        html_content = re.sub(
            r'<a([^>]*?)href="([^"]*)"([^>]*?)>', style_link, html_content
        )

        # Add Bootstrap list, table and blockquote classes (bare tags only)
        bare_tag_classes = (
            ("<ul>", '<ul class="list-group list-group-flush">'),
            ("<ol>", '<ol class="list-group list-group-numbered">'),
            ("<li>", '<li class="list-group-item">'),
            ("<table>", '<table class="table table-striped table-hover">'),
            ("<blockquote>", '<blockquote class="alert alert-info">'),
        )
        for bare_tag, styled_tag in bare_tag_classes:
            html_content = html_content.replace(bare_tag, styled_tag)

        # Add Bootstrap button classes to links that look like buttons
        html_content = re.sub(
            r'<a([^>]*?)class="[^"]*btn[^"]*"([^>]*?)>',
            r'<a\1class="btn btn-primary"\2>',
            html_content,
        )

        return html_content

    def render_task_list_checkboxes(self, html_content: str) -> str:
        """Render markdown task-list markers as checkbox inputs.

        A list item whose content starts with ``[ ]``, ``[x]`` or ``[X]`` gets
        the ``task-list-item`` class and a disabled checkbox in place of the
        marker.

        Args:
            html_content: HTML containing ``<li>`` elements.

        Returns:
            The HTML with task-list items rewritten.
        """

        def replace_task_item(match: re.Match) -> str:
            attrs = self._merge_class_attr(
                match.group("attrs") or "", "task-list-item"
            )
            checked_attr = " checked" if match.group("marker").lower() == "x" else ""
            return (
                f"<li{attrs}>"
                f'<input class="form-check-input me-2" type="checkbox"{checked_attr} disabled>'
                f'{match.group("body")}</li>'
            )

        return re.sub(
            r"<li(?P<attrs>[^>]*)>\s*\[(?P<marker>[ xX])\]\s*(?P<body>.*?)</li>",
            replace_task_item,
            html_content,
            flags=re.DOTALL,
        )

    def extract_title_from_markdown(self, markdown_content: str) -> str:
        """Extract title from markdown content.

        Returns the text of the first level-1 heading (``# Title``) that is
        not inside a fenced code block, so shell comments in code samples are
        not mistaken for the document title.

        Args:
            markdown_content: Markdown source text.

        Returns:
            The heading text, or ``"Documentation"`` when none is found.
        """
        open_fence = ""  # fence character (` or ~) of the currently open fence
        for line in markdown_content.split("\n"):
            stripped = line.strip()
            marker = stripped[:3]
            # A fence line starts with ``` or ~~~ and has no further fence
            # characters after the leading run (rules out "```inline```").
            if marker in ("```", "~~~") and marker[0] not in stripped.lstrip(marker[0]):
                if not open_fence:
                    open_fence = marker[0]
                elif open_fence == marker[0]:
                    open_fence = ""
                continue
            if not open_fence and stripped.startswith("# "):
                return stripped[2:].strip()
        return "Documentation"  # Default fallback title

    def basic_markdown_to_html(self, markdown_content: str) -> str:
        """Basic markdown to HTML conversion - alias for compatibility."""
        return self.markdown_to_html(markdown_content)

    def convert_markdown_links_to_html(
        self, content: str, source_file: str = "", target_dir: str = ""
    ) -> str:
        """Convert markdown links to HTML format.

        Rewrites ``.md`` links (and links to the extension-less files LICENSE,
        README, CHANGELOG, CONTRIBUTING) to ``.html`` in both markdown
        ``[text](link)`` and HTML ``href="link"`` form.

        Args:
            content: Markdown or HTML text containing links.
            source_file: When non-empty, site-build normalizations are applied
                (absolute ``/docs/`` paths, developer directory indexes).
                When empty, relative paths are preserved.
            target_dir: When non-empty, absolute ``/docs/...`` links are made
                relative to this output location.  A value not ending in
                ``/`` is treated as a file path and its directory is used.

        Returns:
            The text with links rewritten.
        """

        # Convert [text](link.md) to [text](link.html) - markdown style
        def replace_md_links(match):
            text, link = match.group(1), match.group(2)
            return f"[{text}]({self._process_link_path(link, source_file)})"

        # Convert href="link.md" to href="link.html" - HTML style
        def replace_href_links(match):
            prefix, link, suffix = match.group(1), match.group(2), match.group(3)
            return f"{prefix}{self._process_link_path(link, source_file)}{suffix}"

        # Apply conversions - expanded patterns to catch more file types
        # Catch .md files and well-known files without extensions
        well_known_link_pattern_md = (
            r"\[([^\]]+)\]\(((?:(?:\.\./)+|\./|/)?"
            r"(?:LICENSE|README|CHANGELOG|CONTRIBUTING)(?:/[^)]*)?(?:#[^)]*)?)\)"
        )
        well_known_link_pattern_href = (
            r'(href=")((?:(?:\.\./)+|\./|/)?'
            r'(?:LICENSE|README|CHANGELOG|CONTRIBUTING)(?:/[^"]*)?(?:#[^"]*)?)(")'
        )

        content = re.sub(
            r"\[([^\]]+)\]\(([^)]+\.md(?:#[^)]*)?)\)", replace_md_links, content
        )
        content = re.sub(well_known_link_pattern_md, replace_md_links, content)
        content = re.sub(
            r'(href=")([^"]+\.md(?:#[^"]*)?)(")', replace_href_links, content
        )
        content = re.sub(well_known_link_pattern_href, replace_href_links, content)

        # The following normalizations are only applied during site builds (when source_file is provided).
        # Unit tests expect relative paths to be preserved.
        if source_file:
            # Ordered (pattern, replacement) pairs; later rules rely on the
            # output of earlier ones, so the order is significant.
            rewrites = [
                # Normalize links that incorrectly include an extra "/docs/" prefix inside /docs pages
                # e.g., href="docs/users/..." when already under /docs/ -> make it absolute "/docs/users/..."
                (r'(href=")(docs/[^"]+)(")', r"\1/\2\3"),
                (r"\]\((docs/[^)]+)\)", r"](/\1)"),
                # Collapse accidental duplicate docs/docs prefixes
                (r'(href=")/?docs/docs/([^"]+)(")', r"\1/docs/\2\3"),
                (r"\]\(/?docs/docs/([^\)]+)\)", r"](/docs/\1)"),
                # Rewrite relative ./docs/... links to absolute /docs/ (HTML and Markdown)
                (r'(href=")\./docs/([^"#]*)(#[^"]*)?(")', r"\1/docs/\2\3\4"),
                (r"\]\(\./docs/([^\)#]*)(#[^\)]*)?\)", r"](/docs/\1\2)"),
                # Rewrite relative ../../docs/... links to absolute /docs/ (HTML and Markdown)
                (r'(href=")(?:\.{2}/)+docs/([^"#]*)(#[^"]*)?(")', r"\1/docs/\2\3\4"),
                (r"\]\((?:\.{2}/)+docs/([^\)#]*)(#[^\)]*)?\)", r"](/docs/\1\2)"),
                # Convert .md (with optional anchors) to .html in both HTML and Markdown links
                (
                    r'(href=")([^"\s]+)\.md(#[^"]*)?(")',
                    lambda m: f"{m.group(1)}{m.group(2)}.html{m.group(3) or ''}{m.group(4)}",
                ),
                (
                    r"\]\(([^\)\s]+)\.md(#[^\)]*)?\)",
                    lambda m: f"]({m.group(1)}.html{m.group(2) or ''})",
                ),
                # Normalize developers relative links to directory indexes
                (
                    r'(href=")\./(architecture|testing|deployment|extending)\.html(")',
                    r"\1./\2/\3",
                ),
                # Normalize absolute developers/*.html to directory indexes
                (
                    r'(href=")([^"\s]*/developers/)(architecture|testing|deployment|extending)\.html(")',
                    r"\1\2\3/\4",
                ),
                (
                    r"\]\(([^\)\s]*/developers/)(architecture|testing|deployment|extending)\.html\)",
                    r"](\1\2/)",
                ),
                # Normalize parent-relative developers links like ../extending.html to ../extending/
                (
                    r'(href=")([^"#]*/developers/)(architecture|testing|deployment|extending)\.html(#[^"]*)?(")',
                    r"\1\2\3/\4\5",
                ),
                # Normalize sibling links such as ../extending.html -> ../extending/
                (
                    r'(href=")\.\./(architecture|testing|deployment|extending)\.html(#[^"]*)?(")',
                    r"\1../\2/\3\4",
                ),
                (
                    r"\]\(\.\./(architecture|testing|deployment|extending)\.html(#[^\)]*)?\)",
                    r"](../\1/\2)",
                ),
                # Ensure well-known repo root files under /docs have .html extension
                (
                    r'(href=")(/docs/(?:LICENSE|README|CHANGELOG|CONTRIBUTING))(#[^"]*)?(")',
                    r"\1\2.html\3\4",
                ),
            ]
            for pattern, replacement in rewrites:
                content = re.sub(pattern, replacement, content)

        # If a target output path is provided, convert absolute /docs/... links to relative ones
        if target_dir:
            try:
                import posixpath

                base_dir = target_dir
                if not base_dir.endswith("/"):
                    base_dir = posixpath.dirname(base_dir) + "/"
                start = base_dir.rstrip("/")

                def _to_relative_html(match: re.Match) -> str:
                    rel = posixpath.relpath("docs/" + match.group(2), start)
                    return f'{match.group(1)}{rel}{match.group(3) or ""}{match.group(4)}'

                def _to_relative_md(match: re.Match) -> str:
                    rel = posixpath.relpath("docs/" + match.group(1), start)
                    return f"]({rel}{match.group(2) or ''})"

                content = re.sub(
                    r'(href=")/docs/([^"#]+)(#[^"]*)?(")',
                    _to_relative_html,
                    content,
                )
                content = re.sub(
                    r"\]\(/docs/([^\)#]+)(#[^\)]*)?\)", _to_relative_md, content
                )
            except Exception:
                # Deliberate best-effort: if relative conversion fails the
                # absolute /docs/ links produced above are kept as they are.
                pass

        return content

    def _process_link_path(self, link: str, source_file: str = "") -> str:
        """Process a link path for conversion.

        Args:
            link: Link target, optionally with a ``#anchor`` fragment.
            source_file: When non-empty, ``docs/``-relative links are made
                absolute (``/docs/...``) and well-known repo files are
                pointed at ``/docs/``.

        Returns:
            The rewritten link with its anchor fragment re-attached.
        """
        # Preserve anchor fragments while processing
        link, hash_sep, fragment = link.partition("#")
        anchor = hash_sep + fragment

        # Only rewrite to absolute /docs when building from a source file context
        if source_file:
            # ../../docs/... -> /docs/...
            link = re.sub(r"^(?:\.{2}/)+docs/", "/docs/", link)
            # ./docs/... -> /docs/...
            link = re.sub(r"^\./docs/", "/docs/", link)
            # docs/... (relative) -> /docs/...
            if link.startswith("docs/"):
                link = "/" + link

        # Preserve bare filename.md#anchor in tests (no source context)
        should_convert_md = not (anchor and "/" not in link and not source_file)

        if link.endswith(".md") and should_convert_md:
            link = link[:-3] + ".html"
        else:
            # Handle well-known files without extensions
            filename = link.split("/")[-1]
            if filename.upper() in ("LICENSE", "README", "CHANGELOG", "CONTRIBUTING"):
                # Ensure these resolve under /docs when referenced from packages
                if source_file and not link.startswith("/docs/"):
                    # Nudge to /docs root for repo-wide files
                    link = "/docs/" + filename
                link = link + ".html"

        # Collapse accidental duplicate /docs/docs prefixes.  The second rule
        # is anchored on a path-segment boundary so that e.g. "mydocs/docs/"
        # is not mistaken for a duplicated "docs/docs/".
        link = re.sub(r"^/docs/docs/", "/docs/", link)
        link = re.sub(r"(^|/)docs/docs/", r"\1docs/", link)

        # Ensure absolute /docs/ links are normalized (only when building)
        if source_file and link.startswith("docs/"):
            link = "/" + link

        return link + anchor

    def render_toc(self, html_content: str) -> str:
        """Generate table of contents from HTML headings.

        Only headings that have a non-empty ``id`` attribute are listed.

        Args:
            html_content: HTML whose ``<h1>``-``<h6>`` elements carry IDs.

        Returns:
            A ``<div class="toc">`` with nested ``<ul>`` lists (one nesting
            level per heading level), or ``""`` when no heading with an ID
            is found.
        """
        import html as _html

        # Find all headings (capture inner HTML, allow multiline).  The lazy
        # scan plus the required whitespace makes sure the real ``id``
        # attribute is captured rather than e.g. ``data-id``.
        heading_pattern = r'<(h[1-6])\b[^>]*?\sid="([^"]+)"[^>]*>(.*?)</h[1-6]>'
        headings = re.findall(heading_pattern, html_content, flags=re.DOTALL)

        if not headings:
            return ""

        icon_style = (
            "width:1rem;height:1rem;object-fit:contain;"
            "vertical-align:middle;margin-right:0.35rem;"
        )

        # Default leaf icon (small, neutral color)
        default_svg = (
            '<svg class="toc-icon" style="width:1rem;height:1rem;object-fit:contain;vertical-align:middle;margin-right:0.35rem;" '
            'width="20" height="20" viewBox="0 0 20 20" fill="none" xmlns="http://www.w3.org/2000/svg">'
            '<path d="M10 7.5C9.50555 7.5 9.0222 7.64662 8.61108 7.92133C8.19995 8.19603 7.87952 8.58648 7.6903 9.04329C7.50108 9.50011 7.45157 10.0028 7.54804 10.4877C7.6445 10.9727 7.8826 11.4181 8.23223 11.7678C8.58187 12.1174 9.02732 12.3555 9.51228 12.452C9.99723 12.5484 10.4999 12.4989 10.9567 12.3097C11.4135 12.1205 11.804 11.8 12.0787 11.3889C12.3534 10.9778 12.5 10.4945 12.5 10C12.5 9.33696 12.2366 8.70107 11.7678 8.23223C11.2989 7.76339 10.663 7.5 10 7.5ZM10 11.25C9.75277 11.25 9.5111 11.1767 9.30554 11.0393C9.09998 10.902 8.93976 10.7068 8.84515 10.4784C8.75054 10.2499 8.72579 9.99861 8.77402 9.75614C8.82225 9.51366 8.9413 9.29093 9.11612 9.11612C9.29093 8.9413 9.51366 8.82225 9.75614 8.77402C9.99861 8.72579 10.2499 8.75054 10.4784 8.84515C10.7068 8.93976 10.902 9.09998 11.0393 9.30554C11.1767 9.5111 11.25 9.75277 11.25 10C11.25 10.3315 11.1183 10.6495 10.8839 10.8839C10.6495 11.1183 10.3315 11.25 10 11.25Z" fill="#343330" />'
            '</svg>'
        )

        parts = ['<div class="toc"><h3>Table of Contents</h3>']
        depth = 0  # number of currently open <ul> elements == current level

        for idx, (tag, heading_id, text) in enumerate(headings):
            level = int(tag[1])  # Extract number from h1, h2, etc.

            # A heading has children exactly when the next heading is deeper.
            has_child = (
                idx + 1 < len(headings) and int(headings[idx + 1][0][1]) > level
            )

            # Open or close nested lists until the depth matches the level.
            if level > depth:
                parts.append("<ul>" * (level - depth))
            elif level < depth:
                parts.append("</ul>" * (depth - level))
            depth = level

            # Extract first <img> if present and sanitize it for TOC display
            icon_html = ""
            img_match = re.search(r'(<img[^>]*>)', text, flags=re.DOTALL)
            if img_match:
                icon_html = img_match.group(1)
                # Remove any on* handlers and javascript: hrefs for safety
                icon_html = re.sub(r"\s(on\w+)\s*=\s*(\"[^\"]*\"|'[^']*')", "", icon_html)
                icon_html = re.sub(r"javascript:\s*", "", icon_html, flags=re.IGNORECASE)
                # Remove any existing size/style attributes so we can normalize appearance
                icon_html = re.sub(r"\s(width|height)=\s*(\"[^\"]*\"|'[^']*')", "", icon_html)
                icon_html = re.sub(r"\sstyle=\s*(\"[^\"]*\"|'[^']*')", "", icon_html)
                # Add class toc-icon (append if class exists)
                if re.search(r"\sclass=\s*\"[^\"]+\"", icon_html):
                    icon_html = re.sub(
                        r"\sclass=\s*\"([^\"]+)\"",
                        lambda m: f' class="{m.group(1)} toc-icon"',
                        icon_html,
                    )
                elif re.search(r"\sclass=\s*'[^']+'", icon_html):
                    icon_html = re.sub(
                        r"\sclass=\s*'([^']+)'",
                        lambda m: f" class='{m.group(1)} toc-icon'",
                        icon_html,
                    )
                else:
                    # Inject class and inline style before the tag end,
                    # keeping a self-closing "/>" intact.
                    if icon_html.endswith("/>"):
                        tag_body, tag_end = icon_html[:-2].rstrip(), " />"
                    else:
                        tag_body, tag_end = icon_html.rstrip(">"), ">"
                    icon_html = (
                        f'{tag_body} class="toc-icon" style="{icon_style}"{tag_end}'
                    )

            # Derive display text: strip HTML, or use img alt, or fallback to id
            display_text = re.sub(r"<[^>]+>", "", text).strip()
            if not display_text:
                alt = re.search(r'<img[^>]*alt=["\']([^"\']+)["\']', text)
                display_text = alt.group(1).strip() if alt else heading_id

            # Unescape first so existing entities are not double-escaped.
            display_text = _html.escape(_html.unescape(display_text))

            # If no icon and this is a leaf heading, use the default SVG
            if not icon_html and not has_child:
                icon_html = default_svg

            parts.append(
                f'<li class="list-group-item"><a href="#{heading_id}">{icon_html}{display_text}</a></li>\n'
            )

        # Close all remaining open lists
        parts.append("</ul>" * depth)
        parts.append("</div>")

        return "".join(parts)