from typing import List, Tuple, Union
import requests
import time
import warnings
from urllib.parse import urljoin, urlparse
import argparse
from service_check_summarizer import summarize_service_check_output
from tts_utils import speak_text
from summary_utils import add_to_combined_summaries
# Suppress SSL warnings for unverified requests (since we're only testing reachability)
warnings.filterwarnings("ignore", message="Unverified HTTPS request")

# Websites to check
list_of_significant_websites: List[str] = [
    "https://www.google.com",
    "https://www.amazon.com",
    "https://www.facebook.com",
    "https://www.apple.com",
    "https://www.microsoft.com",
    "https://www.reddit.com",
    "https://www.wikipedia.org",
    "https://www.netflix.com",
    "https://www.bbc.com",
    "https://www.nytimes.com",
    # Government websites
    "https://www.usa.gov",           # US
    "https://www.canada.ca",         # Canada
    "https://www.gob.mx",            # Mexico
    "https://www.gov.br",            # Brazil
    "https://www.gov.uk",            # UK
    "https://www.gouvernement.fr",   # France
    "https://www.bund.de",           # Germany
    "https://www.belgium.be",        # Belgium
    "https://www.australia.gov.au",  # Australia
    "https://www.india.gov.in",      # India
    # "https://www.gov.za",          # South Africa (removed due to persistent connection issues)
    "https://www.japan.go.jp",       # Japan
    # "https://www.korea.go.kr",     # South Korea (removed due to persistent connection issues)
    "https://www.gov.sg",            # Singapore
]

# User agent to make requests appear like they come from a real browser
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
}


def check_website(url: str, verify_ssl: bool = False) -> Tuple[str, Union[float, str]]:
    """
    Check if a website is reachable.

    Args:
        url: The URL to check
        verify_ssl: Whether to verify SSL certificates (default: False)

    Returns:
        tuple: (status, result) where status is 'reachable' or 'unreachable'
               and result is either the response time or an error message
    """
    try:
        # Parse the URL correctly and probe /robots.txt first
        parsed_url = urlparse(url)
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        robots_url = urljoin(base_url, "/robots.txt")
        response = requests.get(robots_url, headers=headers, timeout=15, verify=verify_ssl)
        if response.status_code == 200:
            return "reachable", response.elapsed.total_seconds()
        elif response.status_code == 404:
            # Retry with the root URL if robots.txt is not found
            response = requests.get(url, headers=headers, timeout=15, verify=verify_ssl)
            if response.status_code == 200:
                return "reachable", response.elapsed.total_seconds()
            else:
                return "unreachable", f"Status Code: {response.status_code}"
        else:
            return "unreachable", f"Status Code: {response.status_code}"
    except requests.exceptions.Timeout:
        return "unreachable", "Timeout reached"
    except requests.exceptions.ConnectionError as e:
        return "unreachable", str(e)
    except Exception as e:
        return "unreachable", str(e)


def check_significant_websites(
        websites: List[str],
        args: argparse.Namespace,
        max_retries: int = 1,
        retry_delay: int = 5
) -> Tuple[List[Tuple[str, float]], List[Tuple[str, str]]]:
    """
    Check the reachability of multiple websites with configurable retry behavior.

    Args:
        websites: List of website URLs to check
        args: Command line arguments
        max_retries: Maximum number of retry attempts (default: 1)
        retry_delay: Delay between retries in seconds (default: 5)

    Returns:
        tuple: (reachable_websites, unreachable_websites), where each entry
               pairs a URL with its response time or error message
    """
    # Initialize lists to store reachable and unreachable websites
    reachable_websites = []
    unreachable_websites = []
    # First round of checks
    for url in websites:
        status, result = check_website(url)
        if status == "reachable":
            reachable_websites.append((url, result))
        else:
            unreachable_websites.append((url, result))
    # Retry logic with configurable attempts
    retry_count = 0
    while unreachable_websites and retry_count < max_retries:
        retry_count += 1
        # print(f"\nRetry attempt {retry_count} of {max_retries}...\n")
        if not args.silent:
            speak_text(f"Retrying unreachable websites, attempt {retry_count}...")
        time.sleep(retry_delay)
        remaining_unreachable = []
        for url, _error in unreachable_websites:
            status, retry_result = check_website(url)
            if status == "reachable":
                reachable_websites.append((url, retry_result))
            else:
                remaining_unreachable.append((url, retry_result))
        unreachable_websites = remaining_unreachable
    return reachable_websites, unreachable_websites
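
# Illustrative usage (the values are hypothetical, not the script's defaults):
# a more patient sweep that retries twice with a 10-second pause, speech off.
#
#   quiet_args = argparse.Namespace(silent=True, polite=False)
#   up, down = check_significant_websites(
#       list_of_significant_websites, quiet_args, max_retries=2, retry_delay=10
#   )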


def main(silent=False, polite=False) -> str:
    """
    Check the reachability of significant websites.

    Args:
        silent: Whether to disable text-to-speech output (default: False)
        polite: Whether to use more polite phrasing in output (default: False)

    Returns:
        str: Summary of website checks
    """
    # Only parse args if called from the command line (i.e. with default argument values)
    if silent is False and polite is False:  # Default values, likely called from command line
        parser = argparse.ArgumentParser(description='Check the reachability of significant websites')
        parser.add_argument('--silent', action='store_true', help='Disable text-to-speech output')
        parser.add_argument('--polite', action='store_true', help='Use more polite phrasing in output')
        args = parser.parse_args()
    else:
        # Create an args namespace for programmatic calls
        args = argparse.Namespace(silent=silent, polite=polite)
    intro_statement = (
        "Initiating connectivity checks on several major technology provider websites and selected government websites, "
        "verifying their current reachability. Should one of these significant sites be fully unreachable, "
        "it may suggest a broader infrastructural fault or a critical disruption in global online communications."
    )
    # print(f"{intro_statement}")
    if not args.silent:
        speak_text(intro_statement)
    reachable, unreachable = check_significant_websites(list_of_significant_websites, args)
    report_on_significant_websites = ""
    # print("Reachable Websites:")
    report_on_significant_websites += "Reachable Websites:\n"
    for url, response_time in reachable:
        # print(f"- {url}: Response Time: {response_time:.6f} seconds")
        report_on_significant_websites += f"- {url}: Response Time: {response_time:.6f} seconds\n"
    if len(unreachable) == 0:
        all_reachable_statement = ("\nSummary of reachability of major tech and government websites:\n"
                                   "All websites are reachable.")
        # print(f"{all_reachable_statement}")
        report_on_significant_websites += all_reachable_statement
    else:
        # print("\nUnreachable Websites:")
        report_on_significant_websites += "\nUnreachable Websites:\n"
        for url, error in unreachable:
            # print(f"- {url}: {error}")
            report_on_significant_websites += f"- {url}: {error}\n"
    significant_website_checks_summary = summarize_service_check_output(report_on_significant_websites)
    # print(f"{significant_website_checks_summary}")
    # Add the summary to the combined summaries
    add_to_combined_summaries(significant_website_checks_summary)
    if not args.silent:
        speak_text("The summary of checking significant websites is as follows:")
        speak_text(significant_website_checks_summary)
    return significant_website_checks_summary
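
# Illustrative programmatic use from another module (the import alias and
# variable name here are assumptions, not part of this script). Note that
# passing silent=True takes the Namespace branch above and skips argv parsing.
#
#   from web_check import main as run_web_check
#   summary = run_web_check(silent=True)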


if __name__ == "__main__":
    main()