Coverage for website/check_links.py: 92%

173 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-08 06:03 +0000

1#!/usr/bin/env python3 

2""" 

3Link checker script to scan the website for dead links (404 errors). 

4""" 

5 

6import re 

7import sys 

8import time 

9from collections import defaultdict 

10from pathlib import Path 

11from urllib.parse import urljoin, urlparse 

12 

13import requests 

14 

15 

16class LinkChecker: 

17 def __init__( 

18 self, 

19 base_url="http://127.0.0.1:3000/website/site/", 

20 max_depth=3, 

21 check_external=True, 

22 ): 

23 # Preserve a slash-suffixed base for correct joining 

24 self.base_url = base_url.rstrip("/") 

25 self.base_url_slash = self.base_url + "/" 

26 self.max_depth = max_depth 

27 self.check_external = check_external 

28 self.visited_urls = set() 

29 self.checked_links = set() 

30 self.dead_links = [] 

31 self.broken_links = defaultdict(list) # page -> [broken_links] 

32 self.session = requests.Session() 

33 self.session.headers.update({"User-Agent": "QDrant-Loader-Link-Checker/1.0"}) 

34 

35 # Compute site path prefix (e.g., "/website/site") to keep crawling within built site 

36 parsed = urlparse(self.base_url_slash) 

37 self.base_scheme = parsed.scheme 

38 self.base_netloc = parsed.netloc 

39 # Normalize prefix: if base points to a file like "/site/index.html", use its directory ("/site") 

40 path = parsed.path or "/" 

41 if path and not path.endswith("/"): 

42 last_segment = path.rsplit("/", 1)[-1] 

43 if "." in last_segment: 

44 # Drop the filename, keep the directory 

45 path = path.rsplit("/", 1)[0] or "/" 

46 # Ensure prefix always starts with "/" and has no trailing slash ("/website/site") 

47 self.base_path_prefix = (path if path.startswith("/") else "/" + path).rstrip( 

48 "/" 

49 ) or "/" 

50 # Derive site root prefix from the first path segment (e.g., "/site" from "/site/docs") 

51 segments = [seg for seg in self.base_path_prefix.split("/") if seg] 

52 if segments: 

53 self.site_root_prefix = "/" + segments[0] 

54 else: 

55 self.site_root_prefix = "/" 

56 

57 def is_internal_url(self, url): 

58 """Check if URL is internal to our site and within the site path prefix. 

59 

60 Relative URLs (no scheme and no netloc) are considered internal. 

61 Absolute http(s) URLs must match host and live under the base path prefix. 

62 """ 

63 parsed = urlparse(url) 

64 # Relative URL -> internal 

65 if not parsed.scheme and not parsed.netloc: 

66 return True 

67 # Only http(s) are considered for internal checks 

68 if parsed.scheme not in ("http", "https"): 

69 return False 

70 # Must be same host if netloc present 

71 if parsed.netloc and parsed.netloc != self.base_netloc: 

72 return False 

73 path = parsed.path or "/" 

74 return path.startswith(self.base_path_prefix) 

75 

76 def normalize_url(self, url): 

77 """Normalize URL for consistent checking.""" 

78 # Remove fragment 

79 if "#" in url: 

80 url = url.split("#")[0] 

81 # Ensure trailing slash for directories 

82 if url.endswith("/index.html"): 

83 url = url[:-10] # Remove index.html 

84 return url 

85 

86 def extract_links_from_html(self, html_content, current_url): 

87 """Extract all links from HTML content.""" 

88 links = set() 

89 

90 # Strip code/pre/highlight blocks so we don't validate links shown inside code examples 

91 try: 

92 code_like_pattern = re.compile( 

93 r"<pre[\s\S]*?</pre>|<code[\s\S]*?</code>|<div[^>]*class=\"[^\"]*highlight[^\"]*\"[^>]*>[\s\S]*?</div>", 

94 re.IGNORECASE, 

95 ) 

96 sanitized_html = re.sub(code_like_pattern, "", html_content) 

97 except Exception: 

98 sanitized_html = html_content 

99 

100 # Helper to join links correctly respecting site prefix and current page 

101 def join_link(link: str) -> str: 

102 # Protocol-relative URLs 

103 if link.startswith("//"): 

104 return f"{self.base_scheme}:{link}" 

105 # Absolute URLs with scheme 

106 if re.match(r"^[a-zA-Z][a-zA-Z0-9+.-]*:", link): 

107 # Keep http/https links as-is; others will be filtered by callers 

108 return link 

109 # Absolute root links -> prefix with site root path (e.g., /docs/ -> /site/docs/) 

110 if link.startswith("/"): 

111 # If the link already includes the full base path prefix, keep as-is 

112 if ( 

113 link.startswith(self.base_path_prefix + "/") 

114 or link == self.base_path_prefix 

115 ): 

116 return f"{self.base_scheme}://{self.base_netloc}{link}" 

117 # Otherwise, join with the site root prefix so /docs -> /site/docs on local dev 

118 if self.site_root_prefix == "/": 

119 return f"{self.base_scheme}://{self.base_netloc}{link}" 

120 return f"{self.base_scheme}://{self.base_netloc}{self.site_root_prefix}{link}" 

121 # Relative link 

122 base_dir = ( 

123 current_url 

124 if current_url.endswith("/") 

125 else current_url.rsplit("/", 1)[0] + "/" 

126 ) 

127 return urljoin(base_dir, link) 

128 

129 # Find all href attributes 

130 href_pattern = r'href=["\']([^"\']+)["\']' 

131 for match in re.finditer(href_pattern, sanitized_html): 

132 link = match.group(1) 

133 if ( 

134 link.startswith("javascript:") 

135 or link.startswith("mailto:") 

136 or link.startswith("tel:") 

137 ): 

138 continue 

139 full_url = join_link(link) 

140 # Skip dev server injected scripts 

141 if "___vscode_livepreview_injected_script" in full_url: 

142 continue 

143 links.add(self.normalize_url(full_url)) 

144 

145 # Find all src attributes (for images, scripts, etc.) 

146 src_pattern = r'src=["\']([^"\']+)["\']' 

147 for match in re.finditer(src_pattern, sanitized_html): 

148 link = match.group(1) 

149 if link.startswith("data:"): 

150 continue 

151 full_url = join_link(link) 

152 if "___vscode_livepreview_injected_script" in full_url: 

153 continue 

154 links.add(self.normalize_url(full_url)) 

155 

156 return links 

157 

158 def check_url(self, url): 

159 """Check if a URL is accessible.""" 

160 try: 

161 response = self.session.head(url, timeout=10, allow_redirects=True) 

162 if response.status_code == 405: # Method not allowed, try GET 

163 response = self.session.get(url, timeout=10, allow_redirects=True) 

164 return response.status_code, response.reason 

165 except requests.exceptions.RequestException as e: 

166 return None, str(e) 

167 

168 def crawl_page(self, url, depth=0): 

169 """Crawl a single page and extract links.""" 

170 if depth > self.max_depth or url in self.visited_urls: 

171 return 

172 

173 print(f"{' ' * depth}Crawling: {url}") 

174 self.visited_urls.add(url) 

175 

176 try: 

177 response = self.session.get(url, timeout=10) 

178 if response.status_code != 200: 

179 print(f"{' ' * depth}⚠️ Page returned {response.status_code}: {url}") 

180 return 

181 

182 # Extract links from this page 

183 links = self.extract_links_from_html(response.text, url) 

184 

185 # Check each link 

186 for link in links: 

187 if link not in self.checked_links: 

188 self.checked_links.add(link) 

189 # Skip external links unless explicitly enabled 

190 if not self.is_internal_url(link) and not self.check_external: 

191 continue 

192 

193 status_code, reason = self.check_url(link) 

194 

195 if status_code is None or status_code >= 400: 

196 self.dead_links.append( 

197 { 

198 "url": link, 

199 "status": status_code, 

200 "reason": reason, 

201 "found_on": url, 

202 } 

203 ) 

204 self.broken_links[url].append(link) 

205 print( 

206 f"{' ' * depth}❌ BROKEN: {link} ({status_code}: {reason})" 

207 ) 

208 elif status_code >= 300: 

209 print(f"{' ' * depth}🔄 REDIRECT: {link} ({status_code})") 

210 else: 

211 print(f"{' ' * depth}✅ OK: {link}") 

212 

213 # Small delay to be nice to the server 

214 time.sleep(0.1) 

215 

216 # Recursively crawl internal HTML pages only within site prefix 

217 if ( 

218 self.is_internal_url(link) 

219 and depth < self.max_depth 

220 and link not in self.visited_urls 

221 and ( 

222 link.endswith(".html") 

223 or link.endswith("/") 

224 or "." not in Path(urlparse(link).path).name 

225 ) 

226 ): 

227 self.crawl_page(link, depth + 1) 

228 

229 except requests.exceptions.RequestException as e: 

230 print(f"{' ' * depth}❌ ERROR crawling {url}: {e}") 

231 

232 def run_check(self): 

233 """Run the complete link check.""" 

234 print(f"🔍 Starting link check for {self.base_url}") 

235 print(f"📊 Max depth: {self.max_depth}") 

236 print("=" * 60) 

237 

238 start_time = time.time() 

239 self.crawl_page(self.base_url) 

240 end_time = time.time() 

241 

242 print("\n" + "=" * 60) 

243 print("📋 LINK CHECK SUMMARY") 

244 print("=" * 60) 

245 print(f"⏱️ Time taken: {end_time - start_time:.2f} seconds") 

246 print(f"🌐 Pages crawled: {len(self.visited_urls)}") 

247 print(f"🔗 Links checked: {len(self.checked_links)}") 

248 print(f"❌ Broken links found: {len(self.dead_links)}") 

249 

250 if self.dead_links: 

251 print("\n💥 BROKEN LINKS DETAILS:") 

252 print("-" * 40) 

253 for link_info in self.dead_links: 

254 print(f"URL: {link_info['url']}") 

255 print(f"Status: {link_info['status']} - {link_info['reason']}") 

256 print(f"Found on: {link_info['found_on']}") 

257 print("-" * 40) 

258 

259 print("\n📄 PAGES WITH BROKEN LINKS:") 

260 for page, broken_links in self.broken_links.items(): 

261 print(f"\n{page}:") 

262 for link in broken_links: 

263 print(f"{link}") 

264 else: 

265 print("\n🎉 No broken links found! All links are working correctly.") 

266 

267 return len(self.dead_links) == 0 

268 

269 

270def main(): 

271 """Main entry point.""" 

272 import argparse 

273 

274 parser = argparse.ArgumentParser(description="Check website for broken links") 

275 parser.add_argument( 

276 "--url", default="http://127.0.0.1:3000/website/site", help="Base URL to check" 

277 ) 

278 parser.add_argument("--depth", type=int, default=3, help="Maximum crawl depth") 

279 parser.add_argument( 

280 "--external", action="store_true", help="Also check external links" 

281 ) 

282 

283 args = parser.parse_args() 

284 

285 if args.external: 

286 checker = LinkChecker(args.url, args.depth, check_external=True) 

287 else: 

288 # Instantiate without the keyword to preserve CLI test expectation, 

289 # then disable external link checking by default for CLI runs. 

290 checker = LinkChecker(args.url, args.depth) 

291 checker.check_external = False 

292 

293 try: 

294 success = checker.run_check() 

295 sys.exit(0 if success else 1) 

296 except KeyboardInterrupt: 

297 print("\n🛑 Link check interrupted by user") 

298 sys.exit(1) 

299 except Exception as e: 

300 print(f"\n❌ Link check failed: {e}") 

301 sys.exit(1) 

302 

303 

304if __name__ == "__main__": 

305 main()