Coverage for website/check_links.py: 100%
125 statements
coverage.py v7.8.2, created at 2025-06-04 05:45 +0000
#!/usr/bin/env python3
"""
Link checker script to scan the website for dead links (404 errors).
"""

import requests
import re
from urllib.parse import urljoin, urlparse
from pathlib import Path
import time
from collections import defaultdict
import sys
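# Requires the third-party "requests" package (e.g. `pip install requests`);
# everything else used here comes from the standard library.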

class LinkChecker:
    def __init__(self, base_url="http://localhost:8000", max_depth=3):
        self.base_url = base_url.rstrip("/")
        self.max_depth = max_depth
        self.visited_urls = set()
        self.checked_links = set()
        self.dead_links = []
        self.broken_links = defaultdict(list)  # page -> [broken_links]
        self.session = requests.Session()
        self.session.headers.update({"User-Agent": "QDrant-Loader-Link-Checker/1.0"})
    def is_internal_url(self, url):
        """Check if URL is internal to our site."""
        parsed = urlparse(url)
        base_parsed = urlparse(self.base_url)
        return parsed.netloc == base_parsed.netloc or not parsed.netloc
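    # A few informal examples for is_internal_url, assuming the default
    # base_url of "http://localhost:8000":
    #   is_internal_url("http://localhost:8000/docs/") -> True
    #   is_internal_url("/docs/users/")                -> True (relative, no netloc)
    #   is_internal_url("https://example.com/")        -> False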
    def normalize_url(self, url):
        """Normalize URL for consistent checking."""
        # Remove fragment
        if "#" in url:
            url = url.split("#")[0]
        # Ensure trailing slash for directories
        if url.endswith("/index.html"):
            url = url[:-10]  # Remove index.html
        return url
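    # For example, normalize_url("http://localhost:8000/docs/index.html#install")
    # drops the fragment and the trailing "index.html", yielding
    # "http://localhost:8000/docs/".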
    def extract_links_from_html(self, html_content, base_url):
        """Extract all links from HTML content."""
        links = set()

        # Find all href attributes
        href_pattern = r'href=["\']([^"\']+)["\']'
        for match in re.finditer(href_pattern, html_content):
            link = match.group(1)
            if link.startswith("javascript:") or link.startswith("mailto:"):
                continue
            full_url = urljoin(base_url, link)
            links.add(self.normalize_url(full_url))

        # Find all src attributes (for images, scripts, etc.)
        src_pattern = r'src=["\']([^"\']+)["\']'
        for match in re.finditer(src_pattern, html_content):
            link = match.group(1)
            if link.startswith("data:"):
                continue
            full_url = urljoin(base_url, link)
            links.add(self.normalize_url(full_url))

        return links
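    # Rough example: given '<a href="/docs/">Docs</a> <img src="logo.png">' and a
    # base_url of "http://localhost:8000/", this returns
    # {"http://localhost:8000/docs/", "http://localhost:8000/logo.png"}.
    # Note this is a simple regex scan, not a full HTML parse, so unquoted
    # attribute values would be missed.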
    def check_url(self, url):
        """Check if a URL is accessible."""
        try:
            response = self.session.head(url, timeout=10, allow_redirects=True)
            if response.status_code == 405:  # Method not allowed, try GET
                response = self.session.get(url, timeout=10, allow_redirects=True)
            return response.status_code, response.reason
        except requests.exceptions.RequestException as e:
            return None, str(e)
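    # check_url returns (status_code, reason) such as (200, "OK") or
    # (404, "Not Found"), and (None, "<error text>") when the request itself
    # fails (DNS error, connection refused, timeout, ...).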
    def crawl_page(self, url, depth=0):
        """Crawl a single page and extract links."""
        if depth > self.max_depth or url in self.visited_urls:
            return

        print(f"{' ' * depth}Crawling: {url}")
        self.visited_urls.add(url)

        try:
            response = self.session.get(url, timeout=10)
            if response.status_code != 200:
                print(f"{' ' * depth}⚠️ Page returned {response.status_code}: {url}")
                return

            # Extract links from this page
            links = self.extract_links_from_html(response.text, url)

            # Check each link
            for link in links:
                if link not in self.checked_links:
                    self.checked_links.add(link)
                    status_code, reason = self.check_url(link)

                    if status_code is None or status_code >= 400:
                        self.dead_links.append(
                            {
                                "url": link,
                                "status": status_code,
                                "reason": reason,
                                "found_on": url,
                            }
                        )
                        self.broken_links[url].append(link)
                        print(
                            f"{' ' * depth}❌ BROKEN: {link} ({status_code}: {reason})"
                        )
                    elif status_code >= 300:
                        print(f"{' ' * depth}🔄 REDIRECT: {link} ({status_code})")
                    else:
                        print(f"{' ' * depth}✅ OK: {link}")

                    # Small delay to be nice to the server
                    time.sleep(0.1)

                # Recursively crawl internal HTML pages
                if (
                    self.is_internal_url(link)
                    and depth < self.max_depth
                    and link not in self.visited_urls
                    and (
                        link.endswith(".html")
                        or link.endswith("/")
                        or "." not in Path(urlparse(link).path).name
                    )
                ):
                    self.crawl_page(link, depth + 1)

        except requests.exceptions.RequestException as e:
            print(f"{' ' * depth}❌ ERROR crawling {url}: {e}")
    def run_check(self):
        """Run the complete link check."""
        print(f"🔍 Starting link check for {self.base_url}")
        print(f"📊 Max depth: {self.max_depth}")
        print("=" * 60)

        start_time = time.time()
        self.crawl_page(self.base_url)
        end_time = time.time()

        print("\n" + "=" * 60)
        print("📋 LINK CHECK SUMMARY")
        print("=" * 60)
        print(f"⏱️ Time taken: {end_time - start_time:.2f} seconds")
        print(f"🌐 Pages crawled: {len(self.visited_urls)}")
        print(f"🔗 Links checked: {len(self.checked_links)}")
        print(f"❌ Broken links found: {len(self.dead_links)}")

        if self.dead_links:
            print("\n💥 BROKEN LINKS DETAILS:")
            print("-" * 40)
            for link_info in self.dead_links:
                print(f"URL: {link_info['url']}")
                print(f"Status: {link_info['status']} - {link_info['reason']}")
                print(f"Found on: {link_info['found_on']}")
                print("-" * 40)

            print("\n📄 PAGES WITH BROKEN LINKS:")
            for page, broken_links in self.broken_links.items():
                print(f"\n{page}:")
                for link in broken_links:
                    print(f"  ❌ {link}")
        else:
            print("\n🎉 No broken links found! All links are working correctly.")

        return len(self.dead_links) == 0
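    # Typical programmatic use (hypothetical):
    #   ok = LinkChecker("http://localhost:8000", max_depth=2).run_check()
    #   # ok is True only when every checked link resolved without error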

def main():
    """Main entry point."""
    import argparse

    parser = argparse.ArgumentParser(description="Check website for broken links")
    parser.add_argument(
        "--url", default="http://localhost:8000", help="Base URL to check"
    )
    parser.add_argument("--depth", type=int, default=3, help="Maximum crawl depth")
    # NOTE: --external is parsed but not currently passed to LinkChecker;
    # links found on crawled pages are always checked, and only internal
    # pages are crawled deeper.
    parser.add_argument(
        "--external", action="store_true", help="Also check external links"
    )

    args = parser.parse_args()

    checker = LinkChecker(args.url, args.depth)

    try:
        success = checker.run_check()
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        print("\n🛑 Link check interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n❌ Link check failed: {e}")
        sys.exit(1)

if __name__ == "__main__":
    main()
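# Example invocation, assuming the built site is served locally
# (e.g. `python -m http.server 8000` from the site output directory):
#
#   python website/check_links.py --url http://localhost:8000 --depth 2
#
# The script exits with status 0 when no broken links are found and 1 otherwise.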