Coverage for src/qdrant_loader/connectors/publicdocs/crawler.py: 80% (85 statements)
coverage.py v7.10.6, created at 2025-09-08 06:05 +0000
from __future__ import annotations

import fnmatch
from typing import Any
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from qdrant_loader.connectors.publicdocs.http import read_text_response as _read_text


async def discover_pages(
    session: Any,
    base_url: str,
    *,
    path_pattern: str | None,
    exclude_paths: list[str],
    logger: Any,
) -> list[str]:
    """Fetch the base URL and discover matching pages under it."""
    # Support both aiohttp-style context manager and direct-await mocks
    status_code = None
    response = None
    html = ""
    context_managed = False
    try:
        get_result = session.get(base_url)

        # Prefer async context manager usage if supported (aiohttp-style)
        if hasattr(get_result, "__aenter__") and hasattr(get_result, "__aexit__"):
            context_managed = True
            async with get_result as resp:  # type: ignore[func-returns-value]
                response = resp
                status = getattr(response, "status", None)
                status_code = status
                if status is None or not (200 <= int(status) < 300):
                    logger.warning(
                        "Non-2xx HTTP status when fetching base URL",
                        url=base_url,
                        status_code=status,
                    )
                    return []
                try:
                    html = await _read_text(response)
                except Exception as e:
                    logger.warning(
                        "Failed to read HTTP response body", url=base_url, error=str(e)
                    )
                    return []
        else:
            # Otherwise await if it's awaitable, or use it directly
            if hasattr(get_result, "__await__"):
                response = await get_result  # type: ignore[assignment]
            else:
                response = get_result

            status = getattr(response, "status", None)
            status_code = status
            if status is None or not (200 <= int(status) < 300):
                logger.warning(
                    "Non-2xx HTTP status when fetching base URL",
                    url=base_url,
                    status_code=status,
                )
                return []
            try:
                html = await _read_text(response)
            except Exception as e:
                logger.warning(
                    "Failed to read HTTP response body", url=base_url, error=str(e)
                )
                return []
    except Exception as e:
        logger.warning("HTTP request failed", url=base_url, error=str(e))
        return []
    finally:
        # Best-effort close for non-context-managed responses
        if response is not None and not context_managed:
            close = getattr(response, "close", None)
            if callable(close):
                try:
                    close()
                except Exception:
                    pass
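    # Every failure path above returns an empty list, so by this point the
    # base page body has been read; log the outcome before parsing.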
    if status_code is not None:
        logger.debug("HTTP request successful", status_code=status_code)
        logger.debug(
            "Received HTML response",
            status_code=status_code,
            content_length=len(html),
        )

    soup = BeautifulSoup(html, "html.parser")
    pages = [base_url]
    seen: set[str] = {base_url}
    base_parsed = urlparse(base_url)
    base_netloc = base_parsed.netloc
    base_path = base_parsed.path or ""
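    # Resolve every anchor against base_url and keep it only if it is an
    # http(s) URL on the same host, inside the base path, not excluded,
    # and matching path_pattern (when one is configured).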
    for link in soup.find_all("a"):
        try:
            href = link.get("href")
            if not href or not isinstance(href, str):
                continue

            if href.startswith("#"):
                continue

            absolute_url = urljoin(base_url, href)
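            # Strip the URL fragment so links that differ only by anchor
            # resolve to a single page URL.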
            absolute_url = absolute_url.split("#")[0]
            parsed = urlparse(absolute_url)

            # Validate scheme
            if parsed.scheme not in ("http", "https"):
                continue

            # Enforce same-origin
            if parsed.netloc != base_netloc:
                continue

            # Enforce base path scope
            abs_path = parsed.path or "/"
            base_path_norm = base_path.rstrip("/")
            if base_path_norm:
                if not (
                    abs_path == base_path_norm
                    or abs_path.startswith(base_path_norm + "/")
                ):
                    continue

            if not any(exclude in absolute_url for exclude in exclude_paths) and (
                path_pattern is None or fnmatch.fnmatch(parsed.path, path_pattern)
            ):
                if absolute_url not in seen:
                    seen.add(absolute_url)
                    logger.debug("Found valid page URL", url=absolute_url)
                    pages.append(absolute_url)
        except Exception as e:  # pragma: no cover - best-effort crawl
            logger.warning(
                "Failed to process link",
                href=str(link.get("href", "")),  # type: ignore
                error=str(e),
            )
            continue

    logger.debug("Page discovery completed", total_pages=len(pages), pages=pages)
    return pages
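
The following is a minimal usage sketch, not part of crawler.py. Because session and logger are typed as Any, it assumes an aiohttp ClientSession (whose get() returns the context manager handled by the first branch above) and a structlog-style logger whose methods accept keyword arguments, matching the logger.warning/logger.debug calls in the function; the base URL, pattern, and exclusions are hypothetical.

import asyncio

import aiohttp
import structlog

from qdrant_loader.connectors.publicdocs.crawler import discover_pages


async def main() -> None:
    logger = structlog.get_logger()
    async with aiohttp.ClientSession() as session:
        pages = await discover_pages(
            session,
            "https://docs.example.com/guide/",   # hypothetical base URL
            path_pattern="*/guide/*",            # fnmatch-style filter on the URL path
            exclude_paths=["/guide/changelog"],  # substring exclusions
            logger=logger,
        )
    print(pages)


if __name__ == "__main__":
    asyncio.run(main())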