test_cases.py
"""
As of v0.1.3, this file holds some more or less naive test cases,
which can also be viewed as examples for using the framework.
Some future version will have a more sophisticated test suite, i.e.
by integrating tests for the webservice.
"""
import pytest

from microwler import Microwler, scrape
from microwler.export import JSONExporter, HTMLExporter


@pytest.mark.asyncio
def test_basic():
    crawler = Microwler('https://quotes.toscrape.com/')
    crawler.run(verbose=True)


@pytest.mark.asyncio
def test_intermediate():
    selectors = {
        'title': scrape.title,
        'headings': scrape.headings,
    }
    settings = {
        'max_depth': 5,          # maximum crawl depth
        'max_concurrency': 30,   # maximum number of concurrent requests
    }
    crawler = Microwler('https://quotes.toscrape.com/', select=selectors, settings=settings)
    crawler.run(verbose=True)


@pytest.mark.asyncio
def test_advanced():
    selectors = {
        'title': scrape.title,
        'headings': scrape.headings,
        'paragraphs': scrape.paragraphs,
        # Define custom selectors using Parsel
        'images': lambda dom: dom.css('img::attr(src)').getall(),
    }

    settings = {
        # XPath used to filter which links the crawler follows
        'link_filter': "//a[contains(@href, 'inspirational')]/@href",
        'max_depth': 10,
        'max_concurrency': 15,
        'export_to': './tests/exports',
        'exporters': [JSONExporter, HTMLExporter],
        'caching': True,
    }

    def transformer(data: dict):
        """ Define a transformer to manipulate your scraped data """
        data['title'] = data['title'].upper()
        data['paragraphs'] = len(data['paragraphs'])
        return data

    crawler = Microwler(
        'https://quotes.toscrape.com/',
        select=selectors,
        transform=transformer,
        settings=settings
    )
    crawler.run(verbose=True, sort_urls=True)
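

# Illustrative sketch, assuming the JSONExporter writes .json files into the
# 'export_to' directory configured above: one way to sanity-check the export
# step after test_advanced() has run would be
#
#   import os
#   assert any(name.endswith('.json') for name in os.listdir('./tests/exports'))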