Coverage for src/qdrant_loader/connectors/publicdocs/crawler.py: 80%

85 statements  

coverage.py v7.10.6, created at 2025-09-08 06:05 +0000

from __future__ import annotations

import fnmatch
from typing import Any
from urllib.parse import urljoin, urlparse

from bs4 import BeautifulSoup

from qdrant_loader.connectors.publicdocs.http import read_text_response as _read_text


async def discover_pages(
    session: Any,
    base_url: str,
    *,
    path_pattern: str | None,
    exclude_paths: list[str],
    logger: Any,
) -> list[str]:
    """Fetch the base URL and discover matching pages under it."""
    # Support both aiohttp-style context manager and direct-await mocks
    status_code = None
    response = None
    html = ""
    context_managed = False
    try:
        get_result = session.get(base_url)

        # Prefer async context manager usage if supported (aiohttp-style)
        if hasattr(get_result, "__aenter__") and hasattr(get_result, "__aexit__"):
            context_managed = True
            async with get_result as resp:  # type: ignore[func-returns-value]
                response = resp
                status = getattr(response, "status", None)
                status_code = status
                if status is None or not (200 <= int(status) < 300):
                    logger.warning(
                        "Non-2xx HTTP status when fetching base URL",
                        url=base_url,
                        status_code=status,
                    )
                    return []
                try:
                    html = await _read_text(response)
                except Exception as e:
                    logger.warning(
                        "Failed to read HTTP response body", url=base_url, error=str(e)
                    )
                    return []
        else:
            # Otherwise await if it's awaitable, or use it directly
            if hasattr(get_result, "__await__"):
                response = await get_result  # type: ignore[assignment]
            else:
                response = get_result

            status = getattr(response, "status", None)
            status_code = status
            if status is None or not (200 <= int(status) < 300):
                logger.warning(
                    "Non-2xx HTTP status when fetching base URL",
                    url=base_url,
                    status_code=status,
                )
                return []
            try:
                html = await _read_text(response)
            except Exception as e:
                logger.warning(
                    "Failed to read HTTP response body", url=base_url, error=str(e)
                )
                return []
    except Exception as e:
        logger.warning("HTTP request failed", url=base_url, error=str(e))
        return []
    finally:
        # Best-effort close for non-context-managed responses
        if response is not None and not context_managed:
            close = getattr(response, "close", None)
            if callable(close):
                try:
                    close()
                except Exception:
                    pass

    if status_code is not None:
        logger.debug("HTTP request successful", status_code=status_code)
        logger.debug(
            "Received HTML response",
            status_code=status_code,
            content_length=len(html),
        )

    soup = BeautifulSoup(html, "html.parser")
    pages = [base_url]
    seen: set[str] = {base_url}
    base_parsed = urlparse(base_url)
    base_netloc = base_parsed.netloc
    base_path = base_parsed.path or ""

    for link in soup.find_all("a"):
        try:
            href = link.get("href")
            if not href or not isinstance(href, str):
                continue

            if href.startswith("#"):
                continue

            absolute_url = urljoin(base_url, href)
            absolute_url = absolute_url.split("#")[0]
            parsed = urlparse(absolute_url)

            # Validate scheme
            if parsed.scheme not in ("http", "https"):
                continue

            # Enforce same-origin
            if parsed.netloc != base_netloc:
                continue

            # Enforce base path scope
            abs_path = parsed.path or "/"
            base_path_norm = base_path.rstrip("/")
            if base_path_norm:
                if not (
                    abs_path == base_path_norm
                    or abs_path.startswith(base_path_norm + "/")
                ):
                    continue

            if not any(exclude in absolute_url for exclude in exclude_paths) and (
                path_pattern is None or fnmatch.fnmatch(parsed.path, path_pattern)
            ):
                if absolute_url not in seen:
                    seen.add(absolute_url)
                    logger.debug("Found valid page URL", url=absolute_url)
                    pages.append(absolute_url)
        except Exception as e:  # pragma: no cover - best-effort crawl
            logger.warning(
                "Failed to process link",
                href=str(link.get("href", "")),  # type: ignore
                error=str(e),
            )
            continue
    logger.debug("Page discovery completed", total_pages=len(pages), pages=pages)
    return pages
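
For context, a minimal usage sketch of discover_pages follows. It assumes an aiohttp ClientSession and a structlog-style logger that accepts keyword arguments; the host docs.example.com, the path pattern, and the exclude path are placeholders, and the real connector may wire the session and logger differently.

# Minimal usage sketch (assumptions noted above; not the connector's actual wiring).
import asyncio

import aiohttp
import structlog

from qdrant_loader.connectors.publicdocs.crawler import discover_pages


async def main() -> None:
    logger = structlog.get_logger()
    async with aiohttp.ClientSession() as session:
        pages = await discover_pages(
            session,
            "https://docs.example.com/latest/",
            path_pattern="*.html",               # fnmatch pattern applied to the URL path
            exclude_paths=["/latest/_static/"],  # substring match against the full URL
            logger=logger,
        )
    print(pages)


if __name__ == "__main__":
    asyncio.run(main())

Note that the base URL is always returned first, and only same-origin links under the base path that pass the pattern and exclude checks are appended after it.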
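
Since this is a coverage report, a test-style sketch may also be useful: the direct-await fallback (a session whose get() is not an async context manager) can be exercised with a fake session like the one below. This assumes pytest with pytest-asyncio and that read_text_response ultimately awaits response.text(), aiohttp-style; the project's real fixtures and the helper's behavior may differ.

# Test sketch under the assumptions above; adjust the fake response if
# read_text_response reads the body differently.
from unittest.mock import MagicMock

import pytest

from qdrant_loader.connectors.publicdocs.crawler import discover_pages


class _FakeResponse:
    """Bare response: no __aenter__/__aexit__, so the fallback path is taken."""

    status = 200

    def __init__(self, body: str):
        self._body = body

    async def text(self) -> str:
        return self._body

    def close(self) -> None:  # called best-effort in the finally block
        pass


class _FakeSession:
    """session.get() returns a plain awaitable instead of a context manager."""

    def __init__(self, response: _FakeResponse):
        self._response = response

    def get(self, url: str):
        async def _resolve():
            return self._response

        return _resolve()


@pytest.mark.asyncio
async def test_discover_pages_direct_await() -> None:
    html = (
        '<a href="/docs/guide.html">Guide</a>'
        '<a href="https://elsewhere.example/x.html">off-site</a>'
        '<a href="#section">fragment only</a>'
    )
    pages = await discover_pages(
        _FakeSession(_FakeResponse(html)),
        "https://docs.example.com/docs/",
        path_pattern=None,
        exclude_paths=[],
        logger=MagicMock(),
    )
    # Off-site and fragment-only links are filtered; the base URL comes first.
    assert pages == [
        "https://docs.example.com/docs/",
        "https://docs.example.com/docs/guide.html",
    ]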