-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper.py
68 lines (51 loc) · 2.35 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import requests
import string
import os
from bs4 import BeautifulSoup
website = "https://www.nature.com/nature/articles"


def main():
    """Scrape nature.com article listings and save matching articles as text files.

    Prompts the user for a number of listing pages and an article type
    (e.g. "News"). For each listing page, downloads every article whose
    type label matches and writes its body text to ``Page_<n>/<title>.txt``.
    """
    saved_articles = []
    print("Number of pages:")
    pages = int(input())
    print("Type of article:")
    article_type = input()
    for i in range(1, pages + 1):
        req = requests.get(website, params={'page': i})
        if req.status_code != 200:
            print("The URL returned " + str(req.status_code) + "!")
            continue
        # Create the per-page directory only after a successful response,
        # so failed pages don't leave empty folders behind.
        dir_name = 'Page_' + str(i)
        if not os.path.exists(dir_name):
            os.mkdir(dir_name)
        listing_soup = BeautifulSoup(req.content, "html.parser")
        for article in listing_soup.find_all('article'):  # all article teasers on the page
            span_tags = article.find_all("span", {"class": "c-meta__type"})
            for span in span_tags:
                # Keep only articles whose type label matches the request.
                if span.text != article_type:
                    continue
                # Guard against teasers with no (or an empty) link.
                link_tag = article.find('a')
                if link_tag is None or link_tag.get('href') is None:
                    continue
                r = requests.get("https://www.nature.com" + link_tag.get('href'))
                if r.status_code != 200:
                    continue  # article page failed to load; skip it
                # Separate soup object so the listing soup is not clobbered.
                article_soup = BeautifulSoup(r.content, "html.parser")
                # Build a filesystem-safe filename from the og:title.
                title = article_soup.find('meta', {"property": "og:title"}).get('content')
                title = title.translate(str.maketrans('', '', string.punctuation))
                title = title.replace(' ', '_')
                # Extract the article body, trying known container classes in order.
                body = article_soup.find('div', {"class": "c-article-body"})
                if body is None:
                    body = article_soup.find('div', {"class": "article-item__body"})
                if body is None:
                    body = article_soup.find('article')
                body = body.text.strip()
                txt_file = title + '.txt'
                filename = os.path.join(dir_name, txt_file)
                # Context manager guarantees the file is closed even if write fails.
                with open(filename, 'wb') as fh:
                    fh.write(body.encode('utf-8'))
                saved_articles.append(txt_file)
    print("Saved all articles.")
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()