Commit f7cebb2

Added webpage class and webpage scraper
1 parent 7ed2893 commit f7cebb2

File tree

2 files changed: +133 -0 lines changed

scraper.py

+33
@@ -0,0 +1,33 @@
from webpage import Webpage

from queue import Queue


class Scraper:
    @staticmethod
    def scrape_from_link(link, max_items=20):
        # Breadth-first crawl starting from `link`, collecting up to `max_items` pages
        link_queue = Queue()
        link_queue.put(link)
        webpages = []
        passed_links = {
            link: True
        }  # dict membership test is O(1) on average

        while link_queue.qsize() > 0 and len(webpages) < max_items:
            link = link_queue.get()
            try:  # In case page is invalid, skip it
                item = Webpage(link)
                connects = item.get_connects()
            except Exception:
                # print(f"INVALID LINK: {link}")
                continue
            webpages.append(item)

            for x in connects:
                if x in passed_links:
                    continue
                passed_links[x] = True
                link_queue.put(x)

        return webpages
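A minimal usage sketch of the crawler, assuming the module is importable and the start URL (a placeholder here) is reachable:

# https://example.com is a placeholder start URL, not part of the commit
from scraper import Scraper

pages = Scraper.scrape_from_link("https://example.com", max_items=10)
for page in pages:
    print(page.link)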

webpage.py

+100
@@ -0,0 +1,100 @@
import re
import requests

import numpy as np

from bs4 import BeautifulSoup
from typing import List
from urllib.parse import urlparse, urljoin, urldefrag


class Webpage:
    # Class to store individual webpages

    @staticmethod
    def fetch_html(url: str) -> str:
        # Gives html code of url
        # print(url)
        response = requests.get(url)
        response.raise_for_status()
        return response.text

    @staticmethod
    def compare_domain(url, base_domain) -> bool:
        """
        Compares the domain of a URL with the base domain.
        """
        base_domain_len = len(base_domain.split('.'))

        parsed_url = urlparse(url)
        domain_parts = parsed_url.netloc.split('.')

        if len(domain_parts) < base_domain_len:
            return False

        url_domain = ".".join(domain_parts[-base_domain_len:])

        return url_domain == base_domain

    @staticmethod
    def normalize_url(url: str) -> str:
        """
        Normalizes the URL by removing the fragment and trailing slash.
        """
        url = urldefrag(url)[0]  # Remove the fragment
        if url.endswith('/'):
            url = url[:-1]
        return url

    def __init__(self, link: str):
        self.link = link
        self.html = None
        self.embeddings = None

    def set_embedding(self, embedding: List[np.ndarray]) -> None:
        self.embeddings = embedding

    def get_text(self) -> str:
        # Returns text from webpage
        if self.html is None:
            self.html = Webpage.fetch_html(self.link)
        soup = BeautifulSoup(self.html, "html.parser")
        text = soup.get_text()
        text = re.sub(r'\n+', '\n', text)
        text = re.sub(r' +', ' ', text)
        return text.strip()

    def get_connects(self) -> List[str]:
        # Returns same-domain links found on the page, normalized and de-duplicated
        html = Webpage.fetch_html(self.link)
        self.html = html
        soup = BeautifulSoup(html, "html.parser")
        base_domain = urlparse(self.link).netloc

        connections = []

        links = soup.find_all('a')

        temp_link = self.link
        if temp_link[-1] != '/':
            temp_link += "/"

        for link in links:
            href = link.get('href')
            if href:
                full_url = urljoin(temp_link, href)
                if Webpage.compare_domain(full_url, base_domain):
                    connections.append(full_url)

        if len(connections) == 0:
            return []
        for i in range(len(connections)):
            connections[i] = Webpage.normalize_url(connections[i])
        connections.sort(key=lambda x: (len(x), x))

        result = []
        result.append(connections[0])
        for i in range(1, len(connections)):
            if connections[i] != connections[i - 1]:
                result.append(connections[i])

        return result
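A short sketch of how Webpage might be used on its own; the URL below is a placeholder:

# https://example.com/docs/ is a placeholder URL for illustration
from webpage import Webpage

page = Webpage("https://example.com/docs/")
links = page.get_connects()   # same-domain links, normalized and de-duplicated
text = page.get_text()        # plain text with collapsed whitespace
print(len(links), text[:80])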
