-
Notifications
You must be signed in to change notification settings - Fork 0
/
selenium.py
112 lines (68 loc) · 2.46 KB
/
selenium.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/python
import json
import os
import re
import subprocess
import sys
import time
from urlparse import urljoin, urlparse

import openpyxl as op
import pandas as pd
import wget
from bs4 import BeautifulSoup
from pyvirtualdisplay import Display
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from sh import cd, ls
from termcolor import colored
from tqdm import tqdm
# --- Site-specific configuration: replace every placeholder string below
# --- with real values before running the crawler.
parent_url = "site url"  # root URL; the crawl starts here and stays on its domain
login_page ="Login page url"  # URL of the login form (triggers the sign-in branch in crawl_site)
username_filed_id = "id of the user name filed of the site"  # HTML id of the username <input> ("filed" = "field"; name kept for compatibility)
password_filed_id = "id of the password filed of the site"  # HTML id of the password <input>
username = "user name for the logging"  # login credential
password ="password"  # login credential
domain = urlparse(parent_url).netloc  # netloc used to restrict crawling to this site
# print domain
# Recursively crawl every same-domain page reachable from `url`.
def crawl_site(urls_All, urlsCrwld, driver, url):
    """Visit `url`, collect its links, and recurse into uncrawled same-domain pages.

    Parameters:
        urls_All  -- list of every href discovered so far (mutated in place)
        urlsCrwld -- list of URLs already visited (mutated in place)
        driver    -- a live selenium webdriver used to fetch pages
        url       -- the page to fetch on this call

    Returns the tuple (urlsCrwld, urls_All).
    """
    urlsCrwld.append(url)
    driver.get(url)
    if url == login_page:
        # Sign in: fill both form fields and submit with ENTER.
        inputElement = driver.find_element_by_id(username_filed_id)
        inputElement.send_keys(username)
        inputElement = driver.find_element_by_id(password_filed_id)
        inputElement.send_keys(password)
        inputElement.send_keys(Keys.ENTER)
    html = driver.page_source.encode("utf-8")
    # Explicit parser: BeautifulSoup(html) alone warns and can pick a
    # different parser per machine, giving inconsistent link extraction.
    soup = BeautifulSoup(html, "html.parser")
    for a in soup.findAll("a"):
        href = a.get("href")
        if not href:
            continue
        # Resolve relative hrefs against the current page; raw relative
        # links have an empty netloc and would be skipped by the domain
        # check below. Absolute hrefs pass through urljoin unchanged.
        absolute = urljoin(url, href)
        if absolute not in urls_All:
            urls_All.append(absolute)
    # Iterate over a snapshot (set copy) because the recursive calls
    # append to urls_All while we loop.
    for page in set(urls_All):
        print(page)
        # Recurse only into same-domain pages not yet visited.
        if urlparse(page).netloc == domain and page not in urlsCrwld:
            crawl_site(urls_All, urlsCrwld, driver, page)
    # Always return the pair so the caller's tuple-unpack cannot get None.
    return urlsCrwld, urls_All
if __name__ == "__main__":
    # Launch Chrome maximized so pages render with their full desktop layout.
    options = webdriver.ChromeOptions()
    options.add_argument("--start-maximized")
    driver = webdriver.Chrome(chrome_options=options)
    urls_All = [parent_url]  # every discovered href, seeded with the start page
    urlsCrwld = []           # pages actually visited by the crawler
    try:
        urlsCrwld, urls_All = crawl_site(urls_All, urlsCrwld,
                                         driver, parent_url)
    finally:
        # Quit the browser even if the crawl raises, so no Chrome
        # process is leaked on failure.
        driver.quit()
    # Summary: parenthesized prints are valid in both Python 2 and 3.
    print("FULL URLs LIST")
    print(len(set(urls_All)))
    print("CRAWLED URLs LIST")
    print(len(set(urlsCrwld)))