-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathwebcrawler.py
More file actions
26 lines (23 loc) · 792 Bytes
/
webcrawler.py
File metadata and controls
26 lines (23 loc) · 792 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
from urllib.parse import urljoin

from bs4 import BeautifulSoup
from requests_html import HTMLSession
def getLinks(url):
    """Fetch *url* and return every absolute http(s) link found on the page.

    The page is downloaded with ``requests_html.HTMLSession`` and parsed with
    BeautifulSoup (lxml backend).  Relative hrefs (``./x``, ``/x``, ``x``) are
    resolved against *url* with :func:`urllib.parse.urljoin`; fragment-only
    anchors (``#...``) are skipped.  The collected links are also printed to
    stdout, preserving the original script's console output.

    Parameters
    ----------
    url : str
        Address of the page to crawl.

    Returns
    -------
    list[str]
        Absolute URLs of the anchors found on the page.
    """
    links = []
    session = HTMLSession()
    try:
        response = session.get(url)
        soup = BeautifulSoup(response.text, 'lxml')
        for link in soup.find_all('a', href=True):
            href = link['href']
            # Fragment-only anchors point back into this same document.
            if href.startswith('#'):
                continue
            # urljoin resolves './x', '/x' and bare 'x' correctly; the old
            # naive `url + href` concatenation built malformed URLs such as
            # 'http://example.com./page' and dropped bare relative hrefs.
            absolute = urljoin(url, href)
            if absolute.startswith('http'):
                links.append(absolute)
    finally:
        # Release the session's underlying connection pool even on error.
        session.close()
    print('-------------------------------------')
    print("Crawling the target website.....")
    print("Links present on this website-")  # fixed typo 'presesnt'
    for link in links:
        print(link)
    return links