Webscrapping_Script_MouseCellAtlas.py
# Import all libraries required for the script
import time

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
# The tissue type can be changed by editing the tissue name in the URL below,
# e.g. Adult-Brain, Bone-Marrow, Retina, etc.
# (a parameterized wrapper is sketched at the end of this file)
driver.get('http://bis.zju.edu.cn/MCA/gallery.html?tissue=Adult-Brain')

# Open an output file
dataFile = open("AdultBrain_MouseCellAtlas.txt", "w")
try:
    # Wait up to 30 seconds for the marker table to appear on the page
    element = WebDriverWait(driver, 30).until(
        EC.presence_of_element_located((By.CLASS_NAME, "sorting_1"))
    )
finally:
    # Set the xpath to the marker table length option, i.e. the number of entries shown
    xpath = '//*[@id="marker_table_length"]/label/select/option[4]'
    driver.execute_script(
        "document.evaluate(arguments[0], document, null, "
        "XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue.value = arguments[1];",
        xpath, "10000")
    # Find the marker table length dropdown in the page body and select the highest value
    select = Select(driver.find_element(By.NAME, 'marker_table_length'))
    select.select_by_value('10000')  # select by value
    # Pause for 10 seconds so all rows have time to load
    time.sleep(10)
    # Parse the rendered page with the HTML parser
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    ## The script now writes out whatever it finds in the body of the HTML page
    # Write the header output to the file
    dataFile.write("### columns from Head ###")
    dataFile.write("\n")
    data = soup.find('thead').findAll('th')
    header = ""
    for tag in data:
        header = header + " " + tag.text
    dataFile.write(header)
    dataFile.write("\n")
    dataFile.write("\n")

    # Append the body output to the file
    dataFile.write("### data from body ###")
    dataFile.write("\n")
    tbody = soup.find('tbody').findAll('tr')
    # Loop over the table body and write each row as text
    for tr in tbody:
        row = ""
        td = tr.findAll('td')
        dataFile.write("\n")
        for each in td:
            row = row + " " + each.text
        # Write the row
        dataFile.write(row)

    # Exit the driver and close the data file
    driver.close()
    dataFile.close()
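

# ---------------------------------------------------------------------------
# Optional, hypothetical helper (not part of the original script): a minimal
# sketch of how the steps above could be wrapped in a reusable function that
# takes the tissue name as a parameter, assuming the MCA gallery accepts any
# valid tissue string in its "tissue" query parameter. The function name, its
# arguments, and the output filename pattern are illustrative assumptions; it
# reuses the imports at the top of this file.
def scrape_tissue(tissue, out_path=None, wait_seconds=10):
    """Scrape the marker table for one tissue from the Mouse Cell Atlas gallery."""
    out_path = out_path or tissue + "_MouseCellAtlas.txt"
    driver = webdriver.Chrome()
    try:
        driver.get('http://bis.zju.edu.cn/MCA/gallery.html?tissue=' + tissue)
        # Wait for the marker table to appear before interacting with it
        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, "sorting_1"))
        )
        # Force the page-length option to a large value, as the script above does,
        # then select it so all marker rows are rendered at once
        xpath = '//*[@id="marker_table_length"]/label/select/option[4]'
        driver.execute_script(
            "document.evaluate(arguments[0], document, null, "
            "XPathResult.FIRST_ORDERED_NODE_TYPE, null).singleNodeValue.value = arguments[1];",
            xpath, "10000")
        Select(driver.find_element(By.NAME, 'marker_table_length')).select_by_value('10000')
        time.sleep(wait_seconds)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        with open(out_path, "w") as out:
            # Header row, then one line per marker-table row
            out.write(" ".join(th.text for th in soup.find('thead').findAll('th')) + "\n\n")
            for tr in soup.find('tbody').findAll('tr'):
                out.write(" ".join(td.text for td in tr.findAll('td')) + "\n")
    finally:
        driver.quit()

# Example usage (commented out so the behaviour of the script above is unchanged):
# scrape_tissue('Bone-Marrow')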