# This script walks the base directory you provide at the prompt (using os.walk) and checks
# every HTML file for broken internal (local file) links and broken external (website) links.
# For each broken link it finds, it prints the containing file and the offending URL.
#
# Required third-party libraries: requests and beautifulsoup4 (os and urllib.parse ship
# with Python).
#
# To install the required packages, run the following command in your terminal:
#   pip install beautifulsoup4 requests
#
# To use the script, run it and enter the path of the directory where your website is located.
# Tip: run it from an editor such as VS Code, using the integrated terminal.
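#
# A hypothetical example session (directory name and report contents are illustrative only;
# the output format matches print_results below):
#
#   $ python broken.py
#   Enter the path to your local website directory: ./my-site
#   Scanning for broken links... This may take a while.
#
#   Broken Links Report:
#   --------------------------------------------------
#
#   File: about.html
#    - images/old-logo.png
#    - https://example.com/moved-page
#
#   Total broken links found: 2
#   Number of files with broken links: 1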
import os
import requests
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
from typing import Set, Dict, List


class BrokenLinkChecker:
    def __init__(self, base_directory: str):
        self.base_directory = os.path.abspath(base_directory)
        self.found_links: Set[str] = set()  # External URLs that have already been checked
        self.broken_links: Dict[str, List[str]] = {}  # Maps files to their broken links
        self.session = requests.Session()
        # A browser-like User-Agent keeps some servers from rejecting automated requests.
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

    def is_internal_link(self, url: str) -> bool:
        """Check if a URL is internal (relative) or external."""
        return not bool(urlparse(url).netloc)

    def verify_external_link(self, url: str) -> bool:
        """Verify if an external URL is accessible."""
        try:
            # HEAD is cheaper than GET; follow redirects so the final target is checked.
            response = self.session.head(url, timeout=10, allow_redirects=True)
            if response.status_code == 405:  # Method not allowed: retry with GET
                response = self.session.get(url, timeout=10)
            return response.status_code < 400
        except Exception:
            return False

    def verify_internal_link(self, link: str, source_file: str) -> bool:
        """Verify if an internal link points to an existing file."""
        # Remove fragment identifier if present
        link = link.split('#')[0]
        if not link:  # Empty after removing fragment, e.g. "#top"
            return True

        # Convert the link to an absolute file path: root-relative links resolve against
        # the base directory, all other links against the source file's directory.
        source_dir = os.path.dirname(source_file)
        if link.startswith('/'):
            target_path = os.path.join(self.base_directory, link.lstrip('/'))
        else:
            target_path = os.path.join(source_dir, link)
        target_path = os.path.normpath(target_path)

        # Check if the target file exists
        if os.path.exists(target_path):
            return True

        # If the path doesn't have an extension, try adding .html
        if not os.path.splitext(target_path)[1]:
            return os.path.exists(target_path + '.html')
        return False
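
    # Illustrative examples of how verify_internal_link resolves links (hypothetical paths,
    # assuming base_directory=/site and source_file=/site/blog/post.html):
    #   "/css/main.css" -> /site/css/main.css      (root-relative)
    #   "../index.html" -> /site/index.html        (relative to the source file)
    #   "about"         -> /site/blog/about.html   (no extension, .html fallback)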

    def check_file(self, filepath: str) -> None:
        """Check a single HTML file for broken links."""
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                content = f.read()

            soup = BeautifulSoup(content, 'html.parser')
            broken_links = []

            # Find all links in the file
            for tag in soup.find_all(['a', 'img', 'script', 'link']):
                # Get the appropriate attribute based on tag type
                url = tag.get('href') or tag.get('src')
                if not url:
                    continue

                # Skip special URLs that are not fetchable resources
                if url.startswith(('mailto:', 'tel:', 'javascript:')):
                    continue

                if self.is_internal_link(url):
                    # Internal links resolve relative to the current file, so the same
                    # string can point to different targets: check it for every file.
                    is_broken = not self.verify_internal_link(url, filepath)
                else:
                    # External URLs resolve the same everywhere: check each one only once.
                    if url in self.found_links:
                        continue
                    self.found_links.add(url)
                    is_broken = not self.verify_external_link(url)

                if is_broken:
                    broken_links.append(url)

            if broken_links:
                rel_path = os.path.relpath(filepath, self.base_directory)
                self.broken_links[rel_path] = broken_links
        except Exception as e:
            print(f"Error processing {filepath}: {e}")

    def scan_website(self) -> None:
        """Scan the entire website directory for broken links."""
        html_files = []

        # Find all HTML files under the base directory
        for root, _, files in os.walk(self.base_directory):
            for file in files:
                if file.endswith('.html'):
                    html_files.append(os.path.join(root, file))

        # Process files in parallel; leaving the with-block waits for all checks to finish
        with ThreadPoolExecutor(max_workers=10) as executor:
            executor.map(self.check_file, html_files)

    def print_results(self) -> None:
        """Print the results of the broken link check."""
        if not self.broken_links:
            print("No broken links found!")
            return

        print("\nBroken Links Report:")
        print("-" * 50)
        for file, links in self.broken_links.items():
            print(f"\nFile: {file}")
            for link in links:
                print(f" - {link}")

        total_broken = sum(len(links) for links in self.broken_links.values())
        print(f"\nTotal broken links found: {total_broken}")
        print(f"Number of files with broken links: {len(self.broken_links)}")


def main():
    # Get the website directory from user input
    website_dir = input("Enter the path to your local website directory: ").strip()
    if not os.path.isdir(website_dir):
        print("Error: Directory does not exist!")
        return

    print("Scanning for broken links... This may take a while.")
    checker = BrokenLinkChecker(website_dir)
    checker.scan_website()
    checker.print_results()


if __name__ == "__main__":
    main()
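
# The checker can also be used programmatically instead of via the input() prompt,
# e.g. from another script or a test. A minimal sketch (the path is illustrative):
#
#   from broken import BrokenLinkChecker
#
#   checker = BrokenLinkChecker("./my-site")
#   checker.scan_website()
#   checker.print_results()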