Coverage for website/check_links.py: 100%

125 statements  

coverage.py v7.8.2, created at 2025-06-04 05:45 +0000

#!/usr/bin/env python3
"""
Link checker script to scan the website for dead links (404 errors).
"""

import requests
import re
from urllib.parse import urljoin, urlparse
from pathlib import Path
import time
from collections import defaultdict
import sys


class LinkChecker:
    def __init__(self, base_url="http://localhost:8000", max_depth=3):
        self.base_url = base_url.rstrip("/")
        self.max_depth = max_depth
        self.visited_urls = set()
        self.checked_links = set()
        self.dead_links = []
        self.broken_links = defaultdict(list)  # page -> [broken_links]
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": "QDrant-Loader-Link-Checker/1.0"})

    def is_internal_url(self, url):
        """Check if URL is internal to our site."""
        parsed = urlparse(url)
        base_parsed = urlparse(self.base_url)
        return parsed.netloc == base_parsed.netloc or not parsed.netloc
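    # Illustrative example (assuming the default base_url of http://localhost:8000):
    # is_internal_url("/docs/") and is_internal_url("http://localhost:8000/docs/")
    # both return True (relative URLs have an empty netloc), while
    # is_internal_url("https://example.com/") returns False.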

    def normalize_url(self, url):
        """Normalize URL for consistent checking."""
        # Remove fragment
        if "#" in url:
            url = url.split("#")[0]
        # Ensure trailing slash for directories
        if url.endswith("/index.html"):
            url = url[:-10]  # Remove index.html
        return url
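    # Illustrative example: normalize_url("http://localhost:8000/docs/index.html#intro")
    # first drops the fragment, then strips the trailing "index.html", yielding
    # "http://localhost:8000/docs/".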

    def extract_links_from_html(self, html_content, base_url):
        """Extract all links from HTML content."""
        links = set()

        # Find all href attributes
        href_pattern = r'href=["\']([^"\']+)["\']'
        for match in re.finditer(href_pattern, html_content):
            link = match.group(1)
            if link.startswith("javascript:") or link.startswith("mailto:"):
                continue
            full_url = urljoin(base_url, link)
            links.add(self.normalize_url(full_url))

        # Find all src attributes (for images, scripts, etc.)
        src_pattern = r'src=["\']([^"\']+)["\']'
        for match in re.finditer(src_pattern, html_content):
            link = match.group(1)
            if link.startswith("data:"):
                continue
            full_url = urljoin(base_url, link)
            links.add(self.normalize_url(full_url))

        return links
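    # Illustrative example: given html_content of
    # '<a href="/docs/">Docs</a> <img src="logo.png">' and base_url
    # "http://localhost:8000/", this returns
    # {"http://localhost:8000/docs/", "http://localhost:8000/logo.png"};
    # javascript:, mailto: and data: URLs are skipped entirely.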

    def check_url(self, url):
        """Check if a URL is accessible."""
        try:
            response = self.session.head(url, timeout=10, allow_redirects=True)
            if response.status_code == 405:  # Method not allowed, try GET
                response = self.session.get(url, timeout=10, allow_redirects=True)
            return response.status_code, response.reason
        except requests.exceptions.RequestException as e:
            return None, str(e)
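    # Illustrative example: check_url("http://localhost:8000/missing.html") would
    # typically return (404, "Not Found"), while a connection failure returns
    # (None, "<exception message>"), so callers can treat both cases as broken.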

    def crawl_page(self, url, depth=0):
        """Crawl a single page and extract links."""
        if depth > self.max_depth or url in self.visited_urls:
            return

        print(f"{' ' * depth}Crawling: {url}")
        self.visited_urls.add(url)

        try:
            response = self.session.get(url, timeout=10)
            if response.status_code != 200:
                print(f"{' ' * depth}⚠️ Page returned {response.status_code}: {url}")
                return

            # Extract links from this page
            links = self.extract_links_from_html(response.text, url)

            # Check each link
            for link in links:
                if link not in self.checked_links:
                    self.checked_links.add(link)
                    status_code, reason = self.check_url(link)

                    if status_code is None or status_code >= 400:
                        self.dead_links.append(
                            {
                                "url": link,
                                "status": status_code,
                                "reason": reason,
                                "found_on": url,
                            }
                        )
                        self.broken_links[url].append(link)
                        print(
                            f"{' ' * depth}❌ BROKEN: {link} ({status_code}: {reason})"
                        )
                    elif status_code >= 300:
                        print(f"{' ' * depth}🔄 REDIRECT: {link} ({status_code})")
                    else:
                        print(f"{' ' * depth}✅ OK: {link}")

                    # Small delay to be nice to the server
                    time.sleep(0.1)

                # Recursively crawl internal HTML pages
                if (
                    self.is_internal_url(link)
                    and depth < self.max_depth
                    and link not in self.visited_urls
                    and (
                        link.endswith(".html")
                        or link.endswith("/")
                        or "." not in Path(urlparse(link).path).name
                    )
                ):
                    self.crawl_page(link, depth + 1)

        except requests.exceptions.RequestException as e:
            print(f"{' ' * depth}❌ ERROR crawling {url}: {e}")

    def run_check(self):
        """Run the complete link check."""
        print(f"🔍 Starting link check for {self.base_url}")
        print(f"📊 Max depth: {self.max_depth}")
        print("=" * 60)

        start_time = time.time()
        self.crawl_page(self.base_url)
        end_time = time.time()

        print("\n" + "=" * 60)
        print("📋 LINK CHECK SUMMARY")
        print("=" * 60)
        print(f"⏱️ Time taken: {end_time - start_time:.2f} seconds")
        print(f"🌐 Pages crawled: {len(self.visited_urls)}")
        print(f"🔗 Links checked: {len(self.checked_links)}")
        print(f"❌ Broken links found: {len(self.dead_links)}")

        if self.dead_links:
            print("\n💥 BROKEN LINKS DETAILS:")
            print("-" * 40)
            for link_info in self.dead_links:
                print(f"URL: {link_info['url']}")
                print(f"Status: {link_info['status']} - {link_info['reason']}")
                print(f"Found on: {link_info['found_on']}")
                print("-" * 40)

            print("\n📄 PAGES WITH BROKEN LINKS:")
            for page, broken_links in self.broken_links.items():
                print(f"\n{page}:")
                for link in broken_links:
                    print(f"{link}")
        else:
            print("\n🎉 No broken links found! All links are working correctly.")

        return len(self.dead_links) == 0


def main():
    """Main entry point."""
    import argparse

    parser = argparse.ArgumentParser(description="Check website for broken links")
    parser.add_argument(
        "--url", default="http://localhost:8000", help="Base URL to check"
    )
    parser.add_argument("--depth", type=int, default=3, help="Maximum crawl depth")
    parser.add_argument(
        "--external", action="store_true", help="Also check external links"
    )

    args = parser.parse_args()

    checker = LinkChecker(args.url, args.depth)

    try:
        success = checker.run_check()
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        print("\n🛑 Link check interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Link check failed: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
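
# A minimal usage sketch (not part of the measured file above): it assumes the built
# website is already being served locally, e.g. with `python -m http.server 8000`
# run from the site's output directory, and that this module is importable as
# `check_links` (i.e. the website/ directory is on sys.path).
#
#     from check_links import LinkChecker
#
#     checker = LinkChecker(base_url="http://localhost:8000", max_depth=2)
#     all_ok = checker.run_check()   # prints per-link results and a summary
#     raise SystemExit(0 if all_ok else 1)
#
# The equivalent CLI invocation, using the flags defined in main():
#
#     python website/check_links.py --url http://localhost:8000 --depth 2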