Coverage for website/builder/markdown.py: 89%
274 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:03 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-08 06:03 +0000
1"""
2Markdown Processing - Markdown-to-HTML Conversion.
4This module handles markdown processing, HTML conversion,
5and content formatting for the website builder.
6"""
8import re
class MarkdownProcessor:
    """Handles markdown processing and HTML conversion.

    The processor converts markdown to HTML (via the ``markdown`` package when
    it can be imported, otherwise via a small built-in converter), decorates
    the result with Bootstrap CSS classes, guarantees heading anchors, rewrites
    documentation links and renders a table of contents.
    """

    # Bootstrap utility classes applied to each heading level.
    _HEADING_CLASSES = {
        "h1": "display-4 fw-bold text-primary mb-4",
        "h2": "h2 fw-bold text-primary mt-5 mb-3",
        "h3": "h3 fw-bold text-primary mt-5 mb-3",
        "h4": "h4 fw-bold mt-4 mb-3",
        "h5": "h5 fw-bold mt-3 mb-2",
        "h6": "h6 fw-semibold mt-2 mb-1",
    }

    # Opening tag of the wrapper placed around block-level code.
    _CODE_WRAPPER_OPEN = '<div class="code-block-wrapper">'

    # A real ``class="..."`` attribute (not e.g. ``data-class="..."``).
    _CLASS_ATTR_RE = re.compile(r'(?<![\w-])class="([^"]*)"')

    # A real ``id=`` attribute (not e.g. ``data-id=``).
    _ID_ATTR_RE = re.compile(r"(?<![\w-])id\s*=")

    # Prefix of a link that leaves the site: ``scheme:`` or protocol-relative.
    _EXTERNAL_LINK = r"(?:[a-zA-Z][a-zA-Z0-9+.-]*:|//)"

    # Extension-less repository files that are published as ``<name>.html``.
    _WELL_KNOWN_FILES = ("LICENSE", "README", "CHANGELOG", "CONTRIBUTING")

    @staticmethod
    def _merge_classes(attrs: str, classes: str) -> str:
        """Return *attrs* with *classes* added to its ``class`` attribute.

        Args:
            attrs: Raw attribute text of a tag (everything between the tag
                name and ``>``), possibly empty.
            classes: Space-separated class names to add.

        Returns:
            The attribute text with the classes merged into an existing
            ``class="..."`` attribute (no duplicates), or with a new
            ``class`` attribute appended when none exists.
        """
        match = MarkdownProcessor._CLASS_ATTR_RE.search(attrs)
        if match is None:
            return f'{attrs} class="{classes}"'
        existing = match.group(1).split()
        merged = existing + [name for name in classes.split() if name not in existing]
        return (
            attrs[: match.start()]
            + f'class="{" ".join(merged)}"'
            + attrs[match.end() :]
        )

    def markdown_to_html(
        self, markdown_content: str, source_file: str = "", output_file: str = ""
    ) -> str:
        """Convert markdown to HTML with Bootstrap styling.

        Args:
            markdown_content: Markdown source text.
            source_file: Accepted for interface compatibility; not used here.
            output_file: Accepted for interface compatibility; not used here.

        Returns:
            The styled HTML, or ``""`` for empty/whitespace-only input.
        """
        # Normalize empty/whitespace-only content consistently across code paths
        if not markdown_content.strip():
            return ""

        try:
            import markdown

            # Construction and conversion stay inside the ``try`` on purpose:
            # an extension that cannot be imported also raises ImportError and
            # must trigger the same fallback as a missing ``markdown`` package.
            md = markdown.Markdown(
                extensions=[
                    "fenced_code",
                    "codehilite",
                    "tables",
                    "toc",
                    "attr_list",
                    "def_list",
                    "footnotes",
                    "md_in_html",
                    "sane_lists",
                ],
                extension_configs={
                    "codehilite": {
                        "css_class": "codehilite",
                        "use_pygments": False,  # Use simple highlighting without Pygments
                        "guess_lang": True,
                    }
                },
            )
            html = md.convert(markdown_content)
        except ImportError:
            # Fallback to basic conversion
            html = self._basic_markdown_to_html_no_regex(markdown_content)
        else:
            # Fix any remaining malformed code blocks
            html = self.fix_malformed_code_blocks(html)

        # Both paths get Bootstrap classes and heading IDs.
        html = self.add_bootstrap_classes(html)
        return self.ensure_heading_ids(html)

    def _basic_markdown_to_html_no_regex(self, markdown_content: str) -> str:
        """Convert a small markdown subset to HTML without third-party packages.

        Supported: fenced code blocks, ``#``..``######`` headings, ``- `` bullet
        lists, paragraphs (one per non-blank line) and inline bold, italics,
        code and links. Despite the historical name, inline markup is handled
        with ``re``.

        Args:
            markdown_content: Markdown source text.

        Returns:
            HTML lines joined by ``"\\n"``, or ``""`` for blank input.
        """
        if not markdown_content.strip():
            return ""

        def transform_inline(text: str) -> str:
            # Bold must run before italics so ``**x**`` is not read as ``*x*``.
            text = re.sub(
                r"\*\*([^*]+)\*\*", lambda m: f"<strong>{m.group(1)}</strong>", text
            )
            text = re.sub(r"\*([^*]+)\*", lambda m: f"<em>{m.group(1)}</em>", text)
            # Inline code
            text = re.sub(r"`([^`]+)`", lambda m: f"<code>{m.group(1)}</code>", text)
            # Links [text](url)
            text = re.sub(
                r"\[([^\]]+)\]\(([^)]+)\)",
                lambda m: f'<a href="{m.group(2)}">{m.group(1)}</a>',
                text,
            )
            return text

        html_lines: list[str] = []
        in_code_block = False
        in_list = False

        for line in markdown_content.split("\n"):
            # Drop the carriage return left behind by CRLF line endings.
            raw = line.rstrip("\r")

            if raw.startswith("```"):
                if in_code_block:
                    html_lines.append("</code></pre>")
                else:
                    # close any open list before starting code block
                    if in_list:
                        html_lines.append("</ul>")
                        in_list = False
                    html_lines.append("<pre><code>")
                in_code_block = not in_code_block
                continue

            if in_code_block:
                html_lines.append(raw)
                continue

            # List items
            if raw.lstrip().startswith("- "):
                if not in_list:
                    html_lines.append("<ul>")
                    in_list = True
                html_lines.append(f"<li>{transform_inline(raw.lstrip()[2:])}</li>")
                continue

            # Any other line (blank, heading, paragraph) ends the list, so
            # block elements are never emitted inside an open <ul>.
            if in_list:
                html_lines.append("</ul>")
                in_list = False

            # Headings: one to six leading '#' followed by a space.
            level = len(raw) - len(raw.lstrip("#"))
            if 1 <= level <= 6 and raw[level : level + 1] == " ":
                heading_text = transform_inline(raw[level + 1 :])
                html_lines.append(f"<h{level}>{heading_text}</h{level}>")
                continue

            # Paragraphs
            if raw.strip():
                html_lines.append(f"<p>{transform_inline(raw)}</p>")

        # An unterminated fence must still produce balanced tags.
        if in_code_block:
            html_lines.append("</code></pre>")
        # Close any open list
        if in_list:
            html_lines.append("</ul>")

        return "\n".join(html_lines)

    def fix_malformed_code_blocks(self, html_content: str) -> str:
        """Fix code blocks that weren't properly converted by markdown.

        Applies a fixed sequence of regex rewrites that turn paragraphs which
        are really code (shell one-liners, leftover triple-backtick fences)
        into wrapped ``<pre><code>`` blocks.

        Args:
            html_content: HTML produced by the markdown converter.

        Returns:
            The HTML with the recognised patterns rewritten.
        """
        # Paragraph holding inline code that starts with a "bash"/"sh" line.
        html_content = re.sub(
            r'<p><code class="inline-code">(bash|sh)\s*\n\s*([^<]+)</code></p>',
            r'<div class="code-block-wrapper"><pre class="code-block"><code class="language-\1">\2</code></pre></div>',
            html_content,
        )

        # Paragraphs with bash/shell commands (with or without language prefix).
        # NOTE(review): the keyword list matches substrings, so inline code such
        # as "pipeline" or "abcd" is also promoted to a bash block.
        html_content = re.sub(
            r'<p><code class="inline-code">(?:bash\s*\n\s*)?([^<]*(?:mkdir|cd|pip|qdrant-loader|mcp-)[^<]*)</code></p>',
            r'<div class="code-block-wrapper"><pre class="code-block"><code class="language-bash">\1</code></pre></div>',
            html_content,
        )

        # Same heuristic for <code> without a class attribute.
        html_content = re.sub(
            r"<p><code>(?:bash\s*\n\s*)?([^<]*(?:mkdir|cd|pip|qdrant-loader|mcp-)[^<]*)</code></p>",
            r'<div class="code-block-wrapper"><pre class="code-block"><code class="language-bash">\1</code></pre></div>',
            html_content,
        )

        # Clean up stray <p> tags inside code blocks
        html_content = re.sub(
            r"(<code[^>]*>.*?)</p>\s*<p>(.*?</code>)",
            r"\1\n\2",
            html_content,
            flags=re.DOTALL,
        )

        def fix_code_block(match: re.Match) -> str:
            """Rewrite one ``<p>```lang ... ```</p>`` paragraph as a code block."""
            lines = match.group(1).split("\n")
            first_line = lines[0].strip()
            if not first_line.startswith("```"):
                return match.group(0)
            # Whatever follows the opening fence is the language name.
            language = first_line[3:].strip()
            code_content = "\n".join(lines[1:])
            # Remove trailing ``` if present
            if code_content.endswith("```"):
                code_content = code_content[:-3].rstrip()
            return f'<div class="code-block-wrapper"><pre class="code-block"><code class="language-{language}">{code_content}</code></pre></div>'

        # Paragraphs that contain a complete fenced block.
        html_content = re.sub(
            r"<p>(```[^`]*```)</p>", fix_code_block, html_content, flags=re.DOTALL
        )

        # Fence, body and closing fence each in their own paragraph.
        html_content = re.sub(
            r"<p>```(\w+)\s*</p>\s*<p>(.*?)</p>\s*<p>```</p>",
            r'<div class="code-block-wrapper"><pre class="code-block"><code class="language-\1">\2</code></pre></div>',
            html_content,
            flags=re.DOTALL,
        )

        # Remaining fenced paragraphs, language optional.
        html_content = re.sub(
            r"<p>```(\w+)?\s*(.*?)\s*```</p>",
            lambda m: f'<div class="code-block-wrapper"><pre class="code-block"><code class="language-{m.group(1) or ""}">{m.group(2)}</code></pre></div>',
            html_content,
            flags=re.DOTALL,
        )

        return html_content

    def ensure_heading_ids(self, html_content: str) -> str:
        """Ensure plain-text headings have IDs for anchor links.

        Only headings whose content contains no nested tags are considered.
        A heading that already carries an ``id`` attribute is left untouched.

        Args:
            html_content: HTML to process.

        Returns:
            The HTML with ``id="<slug>"`` added where it was missing.
        """

        def slugify(text: str) -> str:
            """Convert text to URL-safe slug."""
            slug = re.sub(r"[^\w\s-]", "", text.lower())
            return re.sub(r"[-\s]+", "-", slug).strip("-")

        def add_id(match: re.Match) -> str:
            """Add ID to heading if not present."""
            tag, attrs, content = match.group(1), match.group(2), match.group(3)

            if not self._ID_ATTR_RE.search(attrs):
                heading_id = slugify(content)
                # An empty slug (e.g. punctuation-only heading) would give an
                # invalid empty id, so such headings are left without one.
                if heading_id:
                    if attrs:
                        attrs = f' id="{heading_id}" {attrs.strip()}'
                    else:
                        attrs = f' id="{heading_id}"'

            return f"<{tag}{attrs}>{content}</{tag}>"

        heading_pattern = r"<(h[1-6])([^>]*)>([^<]+)</h[1-6]>"
        return re.sub(heading_pattern, add_id, html_content)

    def add_bootstrap_classes(self, html_content: str) -> str:
        """Add Bootstrap classes to HTML elements.

        Headings, code blocks, inline code, links, lists, tables and
        blockquotes are decorated. Classes are merged into an existing
        ``class`` attribute instead of emitting a second, invalid one.

        Args:
            html_content: HTML to decorate.

        Returns:
            The decorated HTML.
        """

        # --- Headings -------------------------------------------------------
        def style_heading(match: re.Match) -> str:
            tag, attrs = match.group(1), match.group(2)
            return f"<{tag}{self._merge_classes(attrs, self._HEADING_CLASSES[tag])}>"

        html_content = re.sub(r"<(h[1-6])\b([^>]*)>", style_heading, html_content)

        # --- Code blocks ----------------------------------------------------
        # codehilite wrappers become the generic code-block wrapper.
        html_content = html_content.replace(
            '<div class="codehilite">', self._CODE_WRAPPER_OPEN
        )

        def style_pre(match: re.Match) -> str:
            wrapper = match.group(1) or ""
            attrs, body = match.group(2), match.group(3)
            if self._CLASS_ATTR_RE.search(attrs):
                new_attrs = self._merge_classes(attrs, "code-block")
            else:
                new_attrs = f' class="code-block"{attrs}'
            block = f"<pre{new_attrs}>{body}</pre>"
            if wrapper:
                # Already wrapped; the existing closing </div> follows the match.
                return wrapper + block
            if not attrs:
                # Only bare <pre> blocks receive a new wrapper.
                return f"{self._CODE_WRAPPER_OPEN}{block}</div>"
            return block

        # Each <pre>...</pre> is handled as one unit so edits never leak from
        # one block into the next.
        html_content = re.sub(
            r'(<div class="code-block-wrapper">)?<pre\b([^>]*)>(.*?)</pre>',
            style_pre,
            html_content,
            flags=re.DOTALL,
        )

        # --- Inline code ----------------------------------------------------
        def style_code(match: re.Match) -> str:
            token = match.group(0)
            # Whole <pre> blocks are consumed unchanged; only <code> found
            # outside of them is inline code.
            return '<code class="inline-code">' if token == "<code>" else token

        html_content = re.sub(
            r"<pre\b[^>]*>.*?</pre>|<code>",
            style_code,
            html_content,
            flags=re.DOTALL,
        )

        # --- Links ----------------------------------------------------------
        def style_link(match: re.Match) -> str:
            before, href, after = match.group(1), match.group(2), match.group(3)
            # Search around the href value, never inside it.
            if self._CLASS_ATTR_RE.search(after):
                after = self._merge_classes(after, "text-decoration-none")
            elif self._CLASS_ATTR_RE.search(before):
                before = self._merge_classes(before, "text-decoration-none")
            else:
                after = f'{after} class="text-decoration-none"'
            return f'<a{before}href="{href}"{after}>'

        html_content = re.sub(
            r'<a([^>]*?)href="([^"]*)"([^>]*?)>', style_link, html_content
        )

        # --- Lists, tables, blockquotes (bare tags only) --------------------
        html_content = html_content.replace(
            "<ul>", '<ul class="list-group list-group-flush">'
        )
        html_content = html_content.replace(
            "<ol>", '<ol class="list-group list-group-numbered">'
        )
        html_content = html_content.replace("<li>", '<li class="list-group-item">')
        html_content = html_content.replace(
            "<table>", '<table class="table table-striped table-hover">'
        )
        html_content = html_content.replace(
            "<blockquote>", '<blockquote class="alert alert-info">'
        )

        # --- Button-like links ---------------------------------------------
        # Any anchor whose class mentions "btn" is normalised to the primary
        # button style.
        html_content = re.sub(
            r'<a([^>]*?)class="[^"]*btn[^"]*"([^>]*?)>',
            r'<a\1class="btn btn-primary"\2>',
            html_content,
        )

        return html_content

    def extract_title_from_markdown(self, markdown_content: str) -> str:
        """Extract the first level-1 heading from markdown content.

        Lines inside fenced code blocks are skipped so that shell comments
        such as ``# install`` are not mistaken for the title.

        Args:
            markdown_content: Markdown source text.

        Returns:
            The heading text, or ``"Documentation"`` when there is none.
        """
        in_fence = False
        for line in markdown_content.split("\n"):
            stripped = line.strip()
            marker = stripped[:3]
            # A fence line opens/closes a block unless the same line also
            # closes it again (e.g. ```inline```).
            if marker in ("```", "~~~") and marker not in stripped[3:]:
                in_fence = not in_fence
                continue
            if not in_fence and stripped.startswith("# "):
                return stripped[2:].strip()
        return "Documentation"  # Default fallback title

    def basic_markdown_to_html(self, markdown_content: str) -> str:
        """Basic markdown to HTML conversion - alias for compatibility.

        Delegates to :meth:`markdown_to_html`.
        """
        return self.markdown_to_html(markdown_content)

    def convert_markdown_links_to_html(
        self, content: str, source_file: str = "", target_dir: str = ""
    ) -> str:
        """Rewrite links to markdown documents so they point at HTML pages.

        Works on both markdown links (``[text](target)``) and HTML ``href``
        attributes. External URLs are never rewritten.

        Args:
            content: Markdown or HTML text containing links.
            source_file: When non-empty, site-build normalisations are applied
                (absolute ``/docs/`` paths, directory indexes for selected
                developer pages, ``.md`` -> ``.html`` everywhere).
            target_dir: When non-empty, absolute ``/docs/...`` links are
                converted to paths relative to this output location. A value
                not ending in ``/`` is treated as a file path and its
                directory is used.

        Returns:
            The text with links rewritten.
        """
        external = self._EXTERNAL_LINK
        pages = r"(architecture|testing|deployment|extending)"

        # Convert [text](link.md) to [text](link.html) - markdown style
        def replace_md_links(match: re.Match) -> str:
            link = self._process_link_path(match.group(2), source_file)
            return f"[{match.group(1)}]({link})"

        # Convert href="link.md" to href="link.html" - HTML style
        def replace_href_links(match: re.Match) -> str:
            link = self._process_link_path(match.group(2), source_file)
            return f"{match.group(1)}{link}{match.group(3)}"

        # .md files and well-known files without extensions
        content = re.sub(
            r"\[([^\]]+)\]\(([^)]+\.md(?:#[^)]*)?)\)", replace_md_links, content
        )
        content = re.sub(
            r"\[([^\]]+)\]\(([^)]*(?:LICENSE|README|CHANGELOG|CONTRIBUTING)(?:/[^)]*)?(?:#[^)]*)?)\)",
            replace_md_links,
            content,
        )
        content = re.sub(
            r'(href=")([^"]+\.md(?:#[^"]*)?)(")', replace_href_links, content
        )
        content = re.sub(
            r'(href=")([^"]*(?:LICENSE|README|CHANGELOG|CONTRIBUTING)(?:/[^"]*)?(?:#[^"]*)?)(")',
            replace_href_links,
            content,
        )

        # The following normalizations are only applied during site builds
        # (when source_file is provided); without it relative paths are kept.
        if source_file:
            # Applied strictly in this order; later rules rely on earlier ones.
            build_rules = [
                # docs/... -> /docs/...
                (r'(href=")(docs/[^"]+)(")', r"\1/\2\3"),
                (r"\]\((docs/[^)]+)\)", r"](/\1)"),
                # Collapse accidental duplicate docs/docs prefixes
                (r'(href=")/?docs/docs/([^"]+)(")', r"\1/docs/\2\3"),
                (r"\]\(/?docs/docs/([^\)]+)\)", r"](/docs/\1)"),
                # ./docs/... -> /docs/...
                (r'(href=")\./docs/([^"#]*)(#[^"]*)?(")', r"\1/docs/\2\3\4"),
                (r"\]\(\./docs/([^\)#]*)(#[^\)]*)?\)", r"](/docs/\1\2)"),
                # ../../docs/... -> /docs/...
                (r'(href=")(?:\.{2}/)+docs/([^"#]*)(#[^"]*)?(")', r"\1/docs/\2\3\4"),
                (r"\]\((?:\.{2}/)+docs/([^\)#]*)(#[^\)]*)?\)", r"](/docs/\1\2)"),
                # .md (with optional anchor) -> .html, local links only
                (
                    r'(href=")(?!' + external + r')([^"\s]+)\.md(#[^"]*)?(")',
                    r"\1\2.html\3\4",
                ),
                (
                    r"\]\((?!" + external + r")([^\)\s]+)\.md(#[^\)]*)?\)",
                    r"](\1.html\2)",
                ),
                # Developer pages are published as directory indexes
                (r'(href=")\./' + pages + r'\.html(")', r"\1./\2/\3"),
                (
                    r'(href=")([^"\s]*/developers/)' + pages + r'\.html(")',
                    r"\1\2\3/\4",
                ),
                (
                    r"\]\(([^\)\s]*/developers/)" + pages + r"\.html\)",
                    r"](\1\2/)",
                ),
                (
                    r'(href=")([^"#]*/developers/)' + pages + r'\.html(#[^"]*)?(")',
                    r"\1\2\3/\4\5",
                ),
                # Sibling links such as ../extending.html -> ../extending/
                (r'(href=")\.\./' + pages + r'\.html(#[^"]*)?(")', r"\1../\2/\3\4"),
                (r"\]\(\.\./" + pages + r"\.html(#[^\)]*)?\)", r"](../\1/\2)"),
                # Well-known repo root files under /docs get .html
                (
                    r'(href=")(/docs/(?:LICENSE|README|CHANGELOG|CONTRIBUTING))(#[^"]*)?(")',
                    r"\1\2.html\3\4",
                ),
            ]
            for pattern, replacement in build_rules:
                content = re.sub(pattern, replacement, content)

        # If a target output path is provided, convert absolute /docs/... links
        # to relative ones.
        if target_dir:
            import posixpath

            base_dir = target_dir
            if not base_dir.endswith("/"):
                # A file path was given; links are relative to its directory.
                base_dir = posixpath.dirname(base_dir) + "/"
            start = base_dir.rstrip("/")

            def _to_relative_html(match: re.Match) -> str:
                rel = posixpath.relpath("docs/" + match.group(2), start)
                return f"{match.group(1)}{rel}{match.group(3) or ''}{match.group(4)}"

            def _to_relative_md(match: re.Match) -> str:
                rel = posixpath.relpath("docs/" + match.group(1), start)
                return f"]({rel}{match.group(2) or ''})"

            try:
                content = re.sub(
                    r'(href=")/docs/([^"#]+)(#[^"]*)?(")',
                    _to_relative_html,
                    content,
                )
                content = re.sub(
                    r"\]\(/docs/([^\)#]+)(#[^\)]*)?\)", _to_relative_md, content
                )
            except ValueError:
                # Best effort: keep the absolute links if a relative path
                # cannot be computed.
                pass

        return content

    def _process_link_path(self, link: str, source_file: str = "") -> str:
        """Rewrite a single link target for the generated site.

        Args:
            link: Link target, optionally with a ``#fragment``.
            source_file: Non-empty when building the site; enables rewriting to
                absolute ``/docs/`` paths.

        Returns:
            The rewritten target with its fragment re-attached. External URLs
            (``scheme:`` or ``//host``) are returned unchanged.
        """
        # Links that leave the site must never be rewritten.
        if re.match(self._EXTERNAL_LINK, link):
            return link

        # Preserve anchor fragments while processing
        link, hash_sep, fragment = link.partition("#")
        anchor = hash_sep + fragment

        # Only rewrite to absolute /docs when building from a source file context
        if source_file:
            # ../../docs/... -> /docs/...
            link = re.sub(r"^(?:\.{2}/)+docs/", "/docs/", link)
            # ./docs/... -> /docs/...
            link = re.sub(r"^\./docs/", "/docs/", link)
            # docs/... (relative) -> /docs/...
            if link.startswith("docs/"):
                link = "/" + link

        # A bare "file.md#anchor" without build context is kept as-is.
        should_convert_md = not (anchor and "/" not in link and not source_file)

        if link.endswith(".md") and should_convert_md:
            link = link[:-3] + ".html"
        else:
            # Handle well-known files without extensions
            filename = link.split("/")[-1]
            if filename.upper() in self._WELL_KNOWN_FILES:
                # Repo-wide files resolve under the /docs root when building.
                if source_file and not link.startswith("/docs/"):
                    link = "/docs/" + filename
                link = link + ".html"

        # Collapse accidental duplicate /docs/docs prefixes
        link = re.sub(r"^/docs/docs/", "/docs/", link)
        link = link.replace("docs/docs/", "docs/")

        # Ensure absolute /docs/ links are normalized (only when building)
        if source_file and link.startswith("docs/"):
            link = "/" + link

        return link + anchor

    def render_toc(self, html_content: str) -> str:
        """Generate table of contents from HTML headings.

        Only headings that carry an ``id`` attribute and contain plain text
        are listed. One ``<ul>`` is opened per heading level.

        Args:
            html_content: HTML whose headings should be listed.

        Returns:
            A ``<div class="toc">`` fragment, or ``""`` when no heading
            qualifies.
        """
        heading_pattern = r'<(h[1-6])[^>]*id="([^"]+)"[^>]*>([^<]+)</h[1-6]>'
        headings = re.findall(heading_pattern, html_content)
        if not headings:
            return ""

        parts = ['<div class="toc"><h3>Table of Contents</h3>']
        depth = 0  # number of currently open <ul> elements

        for tag, heading_id, text in headings:
            level = int(tag[1])  # Extract number from h1, h2, etc.
            if level > depth:
                # Open one list per level skipped on the way down.
                parts.append("<ul>" * (level - depth))
            elif level < depth:
                parts.append("</ul>" * (depth - level))
            depth = level
            parts.append(f'<li><a href="#{heading_id}">{text}</a></li>\n')

        # Close all remaining open lists
        parts.append("</ul>" * depth)
        parts.append("</div>")
        return "".join(parts)