-
Notifications
You must be signed in to change notification settings - Fork 38
/
Copy pathexample_search_DE_literature.py
115 lines (101 loc) · 5.54 KB
/
example_search_DE_literature.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import os
import json
import urllib.request
from multiprocessing import cpu_count
# You will need to install bs4 (python -m pip install bs4)
from bs4 import BeautifulSoup
import holmes_extractor as holmes
# You will need to install falcon (python -m pip install falcon)
import falcon
if __name__ in ('__main__', 'example_search_DE_literature'):
working_directory = # REPLACE WITH PATH TO WORKING DIRECTORY IN SINGLE OR DOUBLE QUOTES
HOLMES_EXTENSION = 'hdc'
flag_filename = os.sep.join((working_directory, 'STORY_PARSING_COMPLETE'))
print('Initializing Holmes (this may take some time) ...')
# Start the Holmes manager with the German model
holmes_manager = holmes.Manager(
model='de_core_news_lg')
def process_documents_from_front_page(front_page_uri, front_page_label):
""" Download and save all the stories from a front page."""
front_page = urllib.request.urlopen(front_page_uri)
front_page_soup = BeautifulSoup(front_page, 'html.parser')
document_texts = []
labels = []
# For each story ...
for anchor in front_page_soup.find_all('a'):
if not anchor['href'].startswith('/') and not anchor['href'].startswith('https'):
this_document_url = '/'.join((front_page_uri, anchor['href']))
print('Downloading story', anchor.contents[0], 'from front page', front_page_label)
# Get the HTML document for the story
this_document = urllib.request.urlopen(this_document_url)
# Extract the raw text from the HTML document
this_document_soup = BeautifulSoup(this_document, 'html.parser')
this_document_text = this_document_soup.prettify()
this_document_text = this_document_text.split('</h1>', 1)[1]
this_document_text = this_document_text.split('<span class="autor"', 1)[0]
this_document_text = this_document_text.replace('<br/>', ' ')
# Remove any carriage returns and line feeds from the raw text
this_document_text = this_document_text.replace(
'\n', ' ').replace('\r', ' ').replace(' ', ' ')
# Replace multiple spaces with single spaces
this_document_text = ' '.join(this_document_text.split())
# Create a document label from the front page label and the story name
this_document_label = ' - '.join((front_page_label, anchor.contents[0]))
document_texts.append(this_document_text)
labels.append(this_document_label)
parsed_documents = holmes_manager.nlp.pipe(document_texts, n_process=cpu_count())
for index, parsed_document in enumerate(parsed_documents):
label = labels[index]
print('Saving', label)
output_filename = os.sep.join((working_directory, label))
output_filename = '.'.join((output_filename, HOLMES_EXTENSION))
with open(output_filename, "wb") as file:
file.write(parsed_document.to_bytes())
def load_documents_from_working_directory():
serialized_documents = {}
for file in os.listdir(working_directory):
if file.endswith(HOLMES_EXTENSION):
print('Loading', file)
label = file[:-4]
long_filename = os.sep.join((working_directory, file))
with open(long_filename, "rb") as file:
contents = file.read()
serialized_documents[label] = contents
print('Indexing documents (this may take some time) ...')
holmes_manager.register_serialized_documents(serialized_documents)
if os.path.exists(working_directory):
if not os.path.isdir(working_directory):
raise RuntimeError(' '.join((working_directory, 'must be a directory')))
else:
os.mkdir(working_directory)
if os.path.isfile(flag_filename):
load_documents_from_working_directory()
else:
process_documents_from_front_page(
"https://maerchen.com/grimm/", 'Gebrüder Grimm')
process_documents_from_front_page(
"https://maerchen.com/grimm2/", 'Gebrüder Grimm')
process_documents_from_front_page(
"https://maerchen.com/andersen/", 'Hans Christian Andersen')
process_documents_from_front_page(
"https://maerchen.com/bechstein/", 'Ludwig Bechstein')
process_documents_from_front_page(
"https://maerchen.com/wolf/", 'Johann Wilhelm Wolf')
# Generate flag file to indicate files can be reloaded on next run
open(flag_filename, 'a').close()
load_documents_from_working_directory()
#Comment following line in to activate interactive console
#holmes_manager.start_topic_matching_search_mode_console(only_one_result_per_document=True)
# The following code starts a RESTful Http service to perform topic searches. It is deployed as
# as WSGI application. An example of how to start it - issued from the directory that
# contains the script - is
# python -m waitress example_search_DE_literature:application
# You will need to install waitress (python -m pip install waitress)
class RestHandler():
def on_get(self, req, resp):
resp.text = \
json.dumps(holmes_manager.topic_match_documents_against(
req.params['entry'][0:200], only_one_result_per_document=True))
resp.cache_control = ["s-maxage=31536000"]
application = falcon.App()
application.add_route('/german', RestHandler())