Skip to content

Commit 8ed15c5

Browse files
authored
Merge pull request #49 from Mews/transient-error-retry
Feature: Retry mechanism for transient errors
2 parents d52dfa1 + 1fe33fa commit 8ed15c5

File tree

5 files changed

+165
-9
lines changed

5 files changed

+165
-9
lines changed

src/tiny_web_crawler/core/spider.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ def crawl(self, url: str) -> None:
8585
return
8686

8787
logger.debug("Crawling: %s", url)
88-
soup = fetch_url(url)
88+
soup = fetch_url(url, retries=self.settings.max_retry_attempts)
8989
if not soup:
9090
return
9191

src/tiny_web_crawler/core/spider_settings.py

+1
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ class CrawlSettings:
4040
internal_links_only: bool = False
4141
external_links_only: bool = False
4242
respect_robots_txt: bool = True
43+
max_retry_attempts: int = 5
4344

4445
@dataclass
4546
class SpiderSettings(GeneralSettings, CrawlSettings):

src/tiny_web_crawler/networking/fetcher.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,32 @@
11
from typing import Optional
2+
import time
23

34
import requests
45
from bs4 import BeautifulSoup
56

67
from tiny_web_crawler.logging import get_logger
78

9+
TRANSIENT_ERRORS = [408, 502, 503, 504]
10+
811
logger = get_logger()
912

10-
def fetch_url(url: str) -> Optional[BeautifulSoup]:
13+
def is_transient_error(status_code: int) -> bool:
14+
return status_code in TRANSIENT_ERRORS
15+
16+
def fetch_url(url: str, retries: int, attempts: int = 0) -> Optional[BeautifulSoup]:
1117
try:
1218
response = requests.get(url, timeout=10)
1319
response.raise_for_status()
1420
data = response.text
1521
return BeautifulSoup(data, 'lxml')
1622
except requests.exceptions.HTTPError as http_err:
23+
if response.status_code and is_transient_error(response.status_code) and retries > 0:
24+
logger.error("Transient HTTP error occurred: %s. Retrying...", http_err)
25+
time.sleep( attempts+1 )
26+
return fetch_url( url, retries-1 , attempts+1)
27+
1728
logger.error("HTTP error occurred: %s", http_err)
29+
return None
1830
except requests.exceptions.ConnectionError as conn_err:
1931
logger.error("Connection error occurred: %s", conn_err)
2032
except requests.exceptions.Timeout as timeout_err:

tests/networking/test_fetcher.py

+92-6
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from unittest.mock import patch
12

23
import responses
34
import requests
@@ -15,7 +16,7 @@ def test_fetch_url() -> None:
1516
status=200
1617
)
1718

18-
resp = fetch_url("http://example.com")
19+
resp = fetch_url("http://example.com", 1)
1920

2021
assert resp is not None
2122
assert resp.text == "link"
@@ -26,15 +27,15 @@ def test_fetch_url_connection_error(caplog) -> None: # type: ignore
2627

2728
with caplog.at_level(ERROR):
2829
# Fetch url whose response isn't mocked to raise ConnectionError
29-
resp = fetch_url("http://connection.error")
30+
resp = fetch_url("http://connection.error", 1)
3031

3132
assert "Connection error occurred:" in caplog.text
3233
assert resp is None
3334

3435

3536
@responses.activate
3637
def test_fetch_url_http_error(caplog) -> None: # type: ignore
37-
error_codes = [403, 404, 408]
38+
error_codes = [403, 404, 412]
3839

3940
for error_code in error_codes:
4041
setup_mock_response(
@@ -44,7 +45,7 @@ def test_fetch_url_http_error(caplog) -> None: # type: ignore
4445
)
4546

4647
with caplog.at_level(ERROR):
47-
resp = fetch_url(f"http://http.error/{error_code}")
48+
resp = fetch_url(f"http://http.error/{error_code}", 1)
4849

4950
assert "HTTP error occurred:" in caplog.text
5051
assert resp is None
@@ -60,7 +61,7 @@ def test_fetch_url_timeout_error(caplog) -> None: # type: ignore
6061

6162
with caplog.at_level(ERROR):
6263
# Fetch url whose response isn't mocked to raise ConnectionError
63-
resp = fetch_url("http://timeout.error")
64+
resp = fetch_url("http://timeout.error", 1)
6465

6566
assert "Timeout error occurred:" in caplog.text
6667
assert resp is None
@@ -76,7 +77,92 @@ def test_fetch_url_requests_exception(caplog) -> None: # type: ignore
7677

7778
with caplog.at_level(ERROR):
7879
# Fetch url whose response isn't mocked to raise ConnectionError
79-
resp = fetch_url("http://requests.exception")
80+
resp = fetch_url("http://requests.exception", 1)
8081

8182
assert "Request error occurred:" in caplog.text
8283
assert resp is None
84+
85+
86+
@patch("time.sleep")
87+
@responses.activate
88+
def test_fetch_url_transient_error_retry_5(mock_sleep, caplog) -> None: # type: ignore
89+
setup_mock_response(
90+
url="http://transient.error",
91+
body="<html><body><a href='http://transient.error'>link</a></body></html>",
92+
status=503
93+
)
94+
95+
max_retry_attempts = 5
96+
97+
with caplog.at_level(ERROR):
98+
resp = fetch_url("http://transient.error", max_retry_attempts)
99+
100+
assert resp is None
101+
102+
# Assert the url was fetched once and then retried max_retry_attempts times
103+
assert len(responses.calls) == max_retry_attempts + 1
104+
105+
# Assert sleep time grew with every request
106+
expected_delays = [1, 2, 3, 4, 5]
107+
actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
108+
assert actual_delays == expected_delays
109+
110+
assert "Transient HTTP error occurred:" in caplog.text
111+
112+
113+
@patch("time.sleep")
114+
@responses.activate
115+
def test_fetch_url_transient_error_retry_10(mock_sleep, caplog) -> None: # type: ignore
116+
setup_mock_response(
117+
url="http://transient.error",
118+
body="<html><body><a href='http://transient.error'>link</a></body></html>",
119+
status=503
120+
)
121+
122+
max_retry_attempts = 10
123+
124+
with caplog.at_level(ERROR):
125+
resp = fetch_url("http://transient.error", max_retry_attempts)
126+
127+
assert resp is None
128+
129+
# Assert the url was fetched once and then retried max_retry_attempts times
130+
assert len(responses.calls) == max_retry_attempts + 1
131+
132+
# Assert sleep time grew with every request
133+
expected_delays = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
134+
actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
135+
assert actual_delays == expected_delays
136+
137+
assert "Transient HTTP error occurred:" in caplog.text
138+
139+
140+
@patch("time.sleep")
141+
@responses.activate
142+
def test_fetch_url_transient_error_retry_success(mock_sleep, caplog) -> None: # type: ignore
143+
setup_mock_response(
144+
url="http://transient.error",
145+
body="<html><body><a href='http://transient.error'>link</a></body></html>",
146+
status=503
147+
)
148+
setup_mock_response(
149+
url="http://transient.error",
150+
body="<html><body><a href='http://transient.error'>link</a></body></html>",
151+
status=200
152+
)
153+
154+
max_retry_attempts = 1
155+
156+
with caplog.at_level(ERROR):
157+
resp = fetch_url("http://transient.error", max_retry_attempts)
158+
159+
assert resp is not None
160+
assert resp.text == "link"
161+
162+
# Assert url was fetched 2 times
163+
assert len(responses.calls) == 2
164+
165+
# Assert time.sleep was called
166+
mock_sleep.assert_called_once_with(1)
167+
168+
assert "Transient HTTP error occurred:" in caplog.text

tests/test_crawler.py

+58-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
from tiny_web_crawler import Spider
1010
from tiny_web_crawler import SpiderSettings
11-
from tiny_web_crawler.logging import DEBUG, WARNING
11+
from tiny_web_crawler.logging import DEBUG, WARNING, ERROR
1212
from tests.utils import setup_mock_response
1313

1414
@responses.activate
@@ -490,3 +490,60 @@ def test_respect_robots_txt_crawl_delay(mock_sleep, mock_urlopen, caplog) -> Non
490490
def test_crawl_no_root_url() -> None:
491491
with pytest.raises(ValueError):
492492
Spider(SpiderSettings(verbose=False))
493+
494+
495+
@patch("time.sleep")
496+
@responses.activate
497+
def test_crawl_url_transient_retry(mock_sleep, caplog) -> None: # type: ignore
498+
setup_mock_response(
499+
url="http://transient.error",
500+
body="<html><body><a href='http://transient.error'>link</a></body></html>",
501+
status=503
502+
)
503+
504+
spider = Spider(
505+
SpiderSettings(root_url="http://transient.error",
506+
respect_robots_txt=False)
507+
)
508+
509+
with caplog.at_level(ERROR):
510+
spider.crawl("http://transient.error")
511+
512+
assert spider.crawl_result == {}
513+
514+
assert len(responses.calls) == 6
515+
516+
expected_delays = [1, 2, 3, 4, 5]
517+
actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
518+
assert actual_delays == expected_delays
519+
520+
assert "Transient HTTP error occurred:" in caplog.text
521+
522+
523+
@patch("time.sleep")
524+
@responses.activate
525+
def test_crawl_url_transient_retry_custom_retry_amount(mock_sleep, caplog) -> None: # type: ignore
526+
setup_mock_response(
527+
url="http://transient.error",
528+
body="<html><body><a href='http://transient.error'>link</a></body></html>",
529+
status=503
530+
)
531+
532+
spider = Spider(
533+
SpiderSettings(root_url="http://transient.error",
534+
max_retry_attempts=10,
535+
respect_robots_txt=False)
536+
)
537+
538+
with caplog.at_level(ERROR):
539+
spider.crawl("http://transient.error")
540+
541+
assert spider.crawl_result == {}
542+
543+
assert len(responses.calls) == 11
544+
545+
expected_delays = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
546+
actual_delays = [call.args[0] for call in mock_sleep.call_args_list]
547+
assert actual_delays == expected_delays
548+
549+
assert "Transient HTTP error occurred:" in caplog.text

0 commit comments

Comments
 (0)