-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathfind_redirects_solar_urls.py
64 lines (50 loc) · 1.86 KB
/
find_redirects_solar_urls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
"""
sinequa does not handle redirects well, needed a python script to find the actual url
and any better titles that might be available. this particular script runs on a csv
containing solar urls
"""
import csv
import json
import os

import bs4 as BeautifulSoup
import requests
def csv_to_dict_list(file_path):
    """Read a CSV file and return its rows as a list of dictionaries.

    The first row of the file is used as the header; every following row
    becomes one dict mapping header names to that row's values.

    Args:
        file_path: Path of the UTF-8 encoded CSV file to read.

    Returns:
        A list with one dict per data row (empty if the file has no data rows).
    """
    # newline="" hands line-ending handling to the csv module, so quoted
    # fields containing embedded line breaks are returned unmodified.
    with open(file_path, encoding="utf-8", newline="") as file:
        return list(csv.DictReader(file))
# Input CSV; the code below reads its "url", "type" and "old title" columns.
file_path = "solar_urls.csv"
# Directory that receives the batched JSON output files.
OUTPUT_DIR = "solar_urls"
# A batch is written whenever the row index is a multiple of this value.
BATCH_SIZE = 1000


def _fetch_final_url_and_title(url):
    """Fetch ``url`` and return ``(final_url, scraped_title)``.

    ``final_url`` is the address reached after following redirects, or
    ``url`` itself when no redirect happened. ``scraped_title`` is the
    stripped text of the page's ``<title>`` tag, or ``""`` when the page has
    no title or cannot be parsed. Both values are ``""`` when the request
    itself fails. Errors are printed rather than raised so that one bad URL
    does not abort the whole run.
    """
    try:
        response = requests.get(url, allow_redirects=True, timeout=5)
    except requests.RequestException as fetch_error:
        print(f"Error fetching URL {url}: {fetch_error}")
        return "", ""

    # response.history is non-empty only when at least one redirect was followed.
    final_url = response.url if response.history else url

    try:
        # The file imports the bs4 *module* under the name ``BeautifulSoup``,
        # so the parser class has to be reached as an attribute of it.
        soup = BeautifulSoup.BeautifulSoup(response.content, "html.parser")
        title_tag = soup.find("title")
        scraped_title = title_tag.text.strip() if title_tag is not None else ""
    except Exception as parse_error:  # best effort: keep the URL, drop the title
        scraped_title = ""
        print(f"Error parsing URL {url}: {parse_error}")

    return final_url, scraped_title


def _write_batch(records, index):
    """Write ``records`` as JSON to ``<OUTPUT_DIR>/<index>.json``."""
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    with open(f"{OUTPUT_DIR}/{index}.json", "w", encoding="utf-8") as out_file:
        json.dump(records, out_file)


urls = csv_to_dict_list(file_path)

# filter out unwanted urls
# this removes 10.5k / 71k urls
urls = [
    u
    for u in urls
    if "unwanted" not in u["type"]
    and "contact" not in u["url"]
    and ".amp" not in u["url"]
]

processed_urls = []
for index, url_data in enumerate(urls):
    url = url_data["url"]
    title = url_data["old title"]
    response_url, scraped_title = _fetch_final_url_and_title(url)

    processed_urls.append(
        {
            "og_url": url,
            "final_url": response_url,
            "og_title": title,
            "scraped_title": scraped_title,
        }
    )

    # Flush periodically so a crash late in the run loses at most one batch.
    if index % BATCH_SIZE == 0:
        print(f"Processed {index} URLs.")
        _write_batch(processed_urls, index)
        processed_urls = []

# Write whatever was collected after the last full batch; without this the
# records following the final multiple of BATCH_SIZE would never be saved.
if processed_urls:
    _write_batch(processed_urls, index)