Coverage for src/qdrant_loader/connectors/publicdocs/crawler.py: 80% (85 statements)
coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
from __future__ import annotations

import fnmatch
from typing import Any
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from qdrant_loader.connectors.publicdocs.http import read_text_response as _read_text


async def discover_pages(
    session: Any,
    base_url: str,
    *,
    path_pattern: str | None,
    exclude_paths: list[str],
    logger: Any,
) -> list[str]:
    """Fetch the base URL and discover matching pages under it."""
    # Support both aiohttp-style context manager and direct-await mocks
    status_code = None
    response = None
    html = ""
    context_managed = False
    try:
        get_result = session.get(base_url)

        # Prefer async context manager usage if supported (aiohttp-style)
        if hasattr(get_result, "__aenter__") and hasattr(get_result, "__aexit__"):
            context_managed = True
            async with get_result as resp:  # type: ignore[func-returns-value]
                response = resp
                status = getattr(response, "status", None)
                status_code = status
                if status is None or not (200 <= int(status) < 300):
                    logger.warning(
                        "Non-2xx HTTP status when fetching base URL",
                        url=base_url,
                        status_code=status,
                    )
                    return []
                try:
                    html = await _read_text(response)
                except Exception as e:
                    logger.warning(
                        "Failed to read HTTP response body", url=base_url, error=str(e)
                    )
                    return []
        else:
            # Otherwise await if it's awaitable, or use it directly
            if hasattr(get_result, "__await__"):
                response = await get_result  # type: ignore[assignment]
            else:
                response = get_result

            status = getattr(response, "status", None)
            status_code = status
            if status is None or not (200 <= int(status) < 300):
                logger.warning(
                    "Non-2xx HTTP status when fetching base URL",
                    url=base_url,
                    status_code=status,
                )
                return []
            try:
                html = await _read_text(response)
            except Exception as e:
                logger.warning(
                    "Failed to read HTTP response body", url=base_url, error=str(e)
                )
                return []
    except Exception as e:
        logger.warning("HTTP request failed", url=base_url, error=str(e))
        return []
    finally:
        # Best-effort close for non-context-managed responses
        if response is not None and not context_managed:
            close = getattr(response, "close", None)
            if callable(close):
                try:
                    close()
                except Exception:
                    pass
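    # Every failure path above returns an empty list, so by this point the
    # base page body has been read; log the outcome before parsing.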
    if status_code is not None:
        logger.debug("HTTP request successful", status_code=status_code)
        logger.debug(
            "Received HTML response",
            status_code=status_code,
            content_length=len(html),
        )

    soup = BeautifulSoup(html, "html.parser")
    pages = [base_url]
    seen: set[str] = {base_url}
    base_parsed = urlparse(base_url)
    base_netloc = base_parsed.netloc
    base_path = base_parsed.path or ""
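    # Resolve every anchor against base_url and keep it only if it is an
    # http(s) URL on the same host, inside the base path, not excluded,
    # and matching path_pattern (when one is configured).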
    for link in soup.find_all("a"):
        try:
            href = link.get("href")
            if not href or not isinstance(href, str):
                continue

            if href.startswith("#"):
                continue

            absolute_url = urljoin(base_url, href)
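            # Strip the URL fragment so links that differ only by anchor
            # resolve to a single page URL.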
            absolute_url = absolute_url.split("#")[0]
            parsed = urlparse(absolute_url)

            # Validate scheme
            if parsed.scheme not in ("http", "https"):
                continue

            # Enforce same-origin
            if parsed.netloc != base_netloc:
                continue

            # Enforce base path scope
            abs_path = parsed.path or "/"
            base_path_norm = base_path.rstrip("/")
            if base_path_norm:
                if not (
                    abs_path == base_path_norm
                    or abs_path.startswith(base_path_norm + "/")
                ):
                    continue

            if not any(exclude in absolute_url for exclude in exclude_paths) and (
                path_pattern is None or fnmatch.fnmatch(parsed.path, path_pattern)
            ):
                if absolute_url not in seen:
                    seen.add(absolute_url)
                    logger.debug("Found valid page URL", url=absolute_url)
                    pages.append(absolute_url)
        except Exception as e:  # pragma: no cover - best-effort crawl
            logger.warning(
                "Failed to process link",
                href=str(link.get("href", "")),  # type: ignore
                error=str(e),
            )
            continue

    logger.debug("Page discovery completed", total_pages=len(pages), pages=pages)
    return pages
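
The following is a minimal usage sketch, not part of crawler.py. Because session and logger are typed as Any, it assumes an aiohttp ClientSession (whose get() returns the context manager handled by the first branch above) and a structlog-style logger whose methods accept keyword arguments, matching the logger.warning/logger.debug calls in the function; the base URL, pattern, and exclusions are hypothetical.

import asyncio

import aiohttp
import structlog

from qdrant_loader.connectors.publicdocs.crawler import discover_pages


async def main() -> None:
    logger = structlog.get_logger()
    async with aiohttp.ClientSession() as session:
        pages = await discover_pages(
            session,
            "https://docs.example.com/guide/",   # hypothetical base URL
            path_pattern="*/guide/*",            # fnmatch-style filter on the URL path
            exclude_paths=["/guide/changelog"],  # substring exclusions
            logger=logger,
        )
    print(pages)


if __name__ == "__main__":
    asyncio.run(main())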