search_scrape_and_parse_articles.py
import requests
from newspaper import Article
import time
import unidecode


# Search with se-scraper and parse the resulting pages with newspaper3k.
class Object(object):
    """Simple attribute container for per-result data."""
    pass


def gsearch(
        term,
        pages=1
):  # requires: sudo docker run -p 3000:3000 tschachn/se-scraper:latest
    term = term.replace("\"", "").replace("\'", "").replace("-", "")
    term += " -filetype:pdf"  # TODO: be able to read pdfs
    data = '{ "browser_config": { "random_user_agent": true },"scrape_config": { "search_engine": "google","keywords": ["' + term + '"],"num_pages": ' + str(
        pages) + '}}'
    header = {'Content-Type': 'application/json'}
    results = requests.post("http://0.0.0.0:3000", data=data,
                            headers=header).json()
    output = []
    for num in range(int(pages)):
        if str(num + 1) in results['results'][term]:
            output.extend(results['results'][term][str(num + 1)]['results'])
    return output
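
# The indexing in gsearch ('results' -> keyword -> page number -> 'results')
# is inferred from how this script reads the se-scraper response; a response
# is assumed to look roughly like the sketch below, not taken from
# se-scraper's documented schema:
# {
#   "results": {
#     "<keyword>": {
#       "1": {"results": [{"link": "...", "title": "...", ...}, ...]},
#       "2": {"results": [...]}
#     }
#   }
# }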
def search(query, link=""):
    unaccented_string = unidecode.unidecode(query)
    query_outs = gsearch(unaccented_string)
    titles = []  # kept for the return signature; not populated below
    labels = []
    para = []
    rm = []
    outputs = [Object() for _ in query_outs]
    for i, qo in enumerate(query_outs):
        query = outputs[i]
        query.link = qo['link']
        # if not query.link or query.link.replace('/', u'\u2215') == link[0]:  # the links have '/' replaced
        #     rm.append(i)
        #     continue
        art = Article(query.link)  # if it is an article, download and parse it
        try:
            start = time.time()
            art.download()
            if not art.html:
                rm.append(i)
                continue
            print(query.link)
            print("took " + str(time.time() - start))
            art.parse()  # some articles cannot be parsed
            para.append(
                art.text.split("\n\n"))  # this makes para a list of lists
            toadd = query.link.replace('/', u'\u2215')
            if len(query.link) > 255:
                toadd = toadd[:255]
            labels.append(
                toadd)  # labels are the names that results are saved under
            query.title = art.title
            if art.title == "":
                query.title = qo[
                    'title']  # fall back to the title shown on Google
            query.sources = art.source_url
            query.authors = art.authors
            if art.publish_date:
                query.date = art.publish_date.strftime('%m/%d/%Y')
        except Exception as e:
            print(e)  # if the page cannot be downloaded or parsed, skip it
    for index in sorted(rm, reverse=True):
        del outputs[index]
    return outputs, labels, para, titles
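

# Minimal usage sketch (not part of the original logic): it assumes the
# se-scraper container is already running on port 3000 as noted next to
# gsearch, and the query string here is only an example.
if __name__ == "__main__":
    outputs, labels, para, titles = search("open source web scraping")
    for out in outputs:
        print(getattr(out, "title", ""), "->", getattr(out, "link", ""))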