diff --git a/docs/examples/code_examples/respect_robots_on_skipped_request.py b/docs/examples/code_examples/respect_robots_on_skipped_request.py new file mode 100644 index 0000000000..5c7eca173f --- /dev/null +++ b/docs/examples/code_examples/respect_robots_on_skipped_request.py @@ -0,0 +1,36 @@ +import asyncio + +from crawlee import SkippedReason +from crawlee.crawlers import ( + BeautifulSoupCrawler, + BeautifulSoupCrawlingContext, +) + + +async def main() -> None: + # Initialize the crawler with robots.txt compliance enabled + crawler = BeautifulSoupCrawler(respect_robots_txt_file=True) + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + + # highlight-start + # This handler is called when a request is skipped + @crawler.on_skipped_request + async def skipped_request_handler(url: str, reason: SkippedReason) -> None: + # Check if the request was skipped due to robots.txt rules + if reason == 'robots_txt': + crawler.log.info(f'Skipped {url} due to robots.txt rules.') + + # highlight-end + + # Start the crawler with the specified URLs + # The login URL will be skipped and handled by the skipped_request_handler + await crawler.run( + ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login'] + ) + + +if __name__ == '__main__': + asyncio.run(main()) diff --git a/docs/examples/respect_robots_txt_file.mdx b/docs/examples/respect_robots_txt_file.mdx index 5f6194c919..dc509e16b8 100644 --- a/docs/examples/respect_robots_txt_file.mdx +++ b/docs/examples/respect_robots_txt_file.mdx @@ -7,6 +7,7 @@ import ApiLink from '@site/src/components/ApiLink'; import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock'; import RespectRobotsTxt from '!!raw-loader!roa-loader!./code_examples/respect_robots_txt_file.py'; +import OnSkippedRequest from '!!raw-loader!roa-loader!./code_examples/respect_robots_on_skipped_request.py'; This example demonstrates how to configure your crawler to respect the rules established by websites for crawlers as described in the [robots.txt](https://www.robotstxt.org/robotstxt.html) file. @@ -19,3 +20,13 @@ The code below demonstrates this behavior using the {RespectRobotsTxt} + +## Handle with `on_skipped_request` + +If you want to process URLs that are skipped according to the `robots.txt` rules, for example for further analysis, use the `on_skipped_request` handler from `BasicCrawler`.
+ +Let's update the code by adding the `on_skipped_request` handler: + + + {OnSkippedRequest} + diff --git a/src/crawlee/__init__.py b/src/crawlee/__init__.py index 86bd6ca2ea..d8433a5e96 100644 --- a/src/crawlee/__init__.py +++ b/src/crawlee/__init__.py @@ -2,7 +2,7 @@ from ._request import Request, RequestOptions from ._service_locator import service_locator -from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction +from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason from ._utils.globs import Glob __version__ = metadata.version('crawlee') @@ -15,5 +15,6 @@ 'Request', 'RequestOptions', 'RequestTransformAction', + 'SkippedReason', 'service_locator', ] diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index c68ae63df9..1764f658a3 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -50,6 +50,8 @@ EnqueueStrategy: TypeAlias = Literal['all', 'same-domain', 'same-hostname', 'same-origin'] """Enqueue strategy to be used for determining which links to extract and enqueue.""" +SkippedReason: TypeAlias = Literal['robots_txt'] + def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]: """Convert all header keys to lowercase, strips whitespace, and returns them sorted by key.""" diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 9abcb4c6f5..5d05098886 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -1,5 +1,6 @@ from __future__ import annotations +import asyncio import logging from abc import ABC from typing import TYPE_CHECKING, Any, Callable, Generic, Union @@ -157,6 +158,7 @@ async def extract_links( kwargs.setdefault('strategy', 'same-hostname') requests = list[Request]() + skipped = list[str]() base_user_data = user_data or {} robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url) @@ -168,8 +170,7 @@ async def extract_links( url = convert_to_absolute_url(base_url, url) if robots_txt_file and not robots_txt_file.is_allowed(url): - # TODO: https://github.com/apify/crawlee-python/issues/1160 - # add processing with on_skipped_request hook + skipped.append(url) continue request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label) @@ -192,6 +193,12 @@ async def extract_links( continue requests.append(request) + + if skipped: + skipped_tasks = [ + asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped + ] + await asyncio.gather(*skipped_tasks) return requests return extract_links diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 7e07c87f16..f13c125095 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -33,6 +33,7 @@ HttpHeaders, RequestHandlerRunResult, SendRequestFunction, + SkippedReason, ) from crawlee._utils.docs import docs_group from crawlee._utils.robots import RobotsTxtFile @@ -81,6 +82,7 @@ TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState) ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Union[Request, None]]] FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]] +SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]] class _BasicCrawlerOptions(TypedDict): @@ 
-335,9 +337,10 @@ def __init__( self._router = None self.router.default_handler(request_handler) - # Error & failed request handlers + # Error, failed & skipped request handlers self._error_handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] | None = None self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None + self._on_skipped_request: SkippedRequestCallback | None = None self._abort_on_error = abort_on_error # Context of each request with matching result of request handler. @@ -540,6 +543,14 @@ def failed_request_handler( self._failed_request_handler = handler return handler + def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback: + """Register a function to handle skipped requests. + + The skipped request handler is invoked when a request is skipped for some reason, for example because it is disallowed by the robots.txt file. + """ + self._on_skipped_request = callback + return callback + async def run( self, requests: Sequence[str | Request] | None = None, @@ -676,8 +687,10 @@ async def add_requests( skipped.append(request) if skipped: - # TODO: https://github.com/apify/crawlee-python/issues/1160 - # add processing with on_skipped_request hook + skipped_tasks = [ + asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped + ] + await asyncio.gather(*skipped_tasks) self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file') request_manager = await self.get_request_manager() @@ -996,6 +1009,30 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling except Exception as e: raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e + async def _handle_skipped_request( + self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False + ) -> None: + if need_mark and isinstance(request, Request): + request_manager = await self.get_request_manager() + + await wait_for( + lambda: request_manager.mark_request_as_handled(request), + timeout=self._internal_timeout, + timeout_message='Marking request as handled timed out after ' + f'{self._internal_timeout.total_seconds()} seconds', + logger=self._logger, + max_retries=3, + ) + request.state = RequestState.SKIPPED + + url = request.url if isinstance(request, Request) else request + + if self._on_skipped_request: + try: + await self._on_skipped_request(url, reason) + except Exception as e: + raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e + def _get_message_from_error(self, error: Exception) -> str: """Get error message summary from exception.
@@ -1152,16 +1189,8 @@ async def __run_task_function(self) -> None: self._logger.warning( f'Skipping request {request.url} ({request.id}) because it is disallowed based on robots.txt' ) - await wait_for( - lambda: request_manager.mark_request_as_handled(request), - timeout=self._internal_timeout, - timeout_message='Marking request as handled timed out after ' - f'{self._internal_timeout.total_seconds()} seconds', - logger=self._logger, - max_retries=3, - ) - # TODO: https://github.com/apify/crawlee-python/issues/1160 - # add processing with on_skipped_request hook + + await self._handle_skipped_request(request, 'robots_txt', need_mark=True) return if request.session_id: diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index dfa2f0dfcc..946645b585 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -1,5 +1,6 @@ from __future__ import annotations +import asyncio import logging from functools import partial from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, Union @@ -292,6 +293,7 @@ async def extract_links( kwargs.setdefault('strategy', 'same-hostname') requests = list[Request]() + skipped = list[str]() base_user_data = user_data or {} elements = await context.page.query_selector_all(selector) @@ -309,8 +311,7 @@ async def extract_links( url = convert_to_absolute_url(base_url, url) if robots_txt_file and not robots_txt_file.is_allowed(url): - # TODO: https://github.com/apify/crawlee-python/issues/1160 - # add processing with on_skipped_request hook + skipped.append(url) continue request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label}) @@ -334,6 +335,12 @@ async def extract_links( requests.append(request) + if skipped: + skipped_tasks = [ + asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped + ] + await asyncio.gather(*skipped_tasks) + return requests return extract_links diff --git a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py index b73ea4aeaa..4a0949b831 100644 --- a/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py +++ b/tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py @@ -3,7 +3,7 @@ from typing import TYPE_CHECKING from unittest import mock -from crawlee import ConcurrencySettings, HttpHeaders, RequestTransformAction +from crawlee import ConcurrencySettings, HttpHeaders, RequestTransformAction, SkippedReason from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext if TYPE_CHECKING: @@ -160,3 +160,26 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: str(server_url / 'start_enqueue'), str(server_url / 'sub_index'), } + + +async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None: + crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True) + skip = mock.Mock() + + @crawler.router.default_handler + async def request_handler(context: BeautifulSoupCrawlingContext) -> None: + await context.enqueue_links() + + @crawler.on_skipped_request + async def skipped_hook(url: str, _reason: SkippedReason) -> None: + skip(url) + + await crawler.run([str(server_url / 'start_enqueue')]) + + skipped = {call[0][0] for call in skip.call_args_list} + + assert skipped == { + str(server_url / 'page_1'), + str(server_url / 'page_2'), + 
str(server_url / 'page_3'), + } diff --git a/tests/unit/crawlers/_parsel/test_parsel_crawler.py b/tests/unit/crawlers/_parsel/test_parsel_crawler.py index 1682c73739..5b66564920 100644 --- a/tests/unit/crawlers/_parsel/test_parsel_crawler.py +++ b/tests/unit/crawlers/_parsel/test_parsel_crawler.py @@ -6,7 +6,7 @@ import pytest -from crawlee import ConcurrencySettings, HttpHeaders, Request, RequestTransformAction +from crawlee import ConcurrencySettings, HttpHeaders, Request, RequestTransformAction, SkippedReason from crawlee.crawlers import ParselCrawler if TYPE_CHECKING: @@ -256,3 +256,26 @@ async def request_handler(context: ParselCrawlingContext) -> None: str(server_url / 'start_enqueue'), str(server_url / 'sub_index'), } + + +async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None: + crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True) + skip = mock.Mock() + + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + await context.enqueue_links() + + @crawler.on_skipped_request + async def skipped_hook(url: str, _reason: SkippedReason) -> None: + skip(url) + + await crawler.run([str(server_url / 'start_enqueue')]) + + skipped = {call[0][0] for call in skip.call_args_list} + + assert skipped == { + str(server_url / 'page_1'), + str(server_url / 'page_2'), + str(server_url / 'page_3'), + } diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index 35b663a567..b926d6c1d0 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -11,7 +11,7 @@ import pytest -from crawlee import ConcurrencySettings, HttpHeaders, Request, RequestTransformAction +from crawlee import ConcurrencySettings, HttpHeaders, Request, RequestTransformAction, SkippedReason from crawlee.crawlers import PlaywrightCrawler from crawlee.fingerprint_suite import ( DefaultFingerprintGenerator, @@ -618,6 +618,29 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: } +async def test_on_skipped_request(server_url: URL) -> None: + crawler = PlaywrightCrawler(respect_robots_txt_file=True) + skip = mock.Mock() + + @crawler.router.default_handler + async def request_handler(context: PlaywrightCrawlingContext) -> None: + await context.enqueue_links() + + @crawler.on_skipped_request + async def skipped_hook(url: str, _reason: SkippedReason) -> None: + skip(url) + + await crawler.run([str(server_url / 'start_enqueue')]) + + skipped = {call[0][0] for call in skip.call_args_list} + + assert skipped == { + str(server_url / 'page_1'), + str(server_url / 'page_2'), + str(server_url / 'page_3'), + } + + async def test_send_request(server_url: URL) -> None: """Check that the persist context works with fingerprints.""" check_data: dict[str, Any] = {}
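
Note: the new `respect_robots_on_skipped_request.py` docs example only logs each skipped URL. To illustrate the "further analysis" use case mentioned in the updated MDX page, a handler can instead aggregate the skipped URLs for later inspection. The sketch below is a minimal illustration that relies only on APIs touched by this patch (`respect_robots_txt_file`, `on_skipped_request`, `SkippedReason`); the in-memory `skipped_urls` list and the final summary log line are illustrative additions, not part of the change. Links that `enqueue_links()` drops because of `robots.txt` are reported through the same callback, as the new crawler tests show.

```python
import asyncio

from crawlee import SkippedReason
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # Enable robots.txt compliance so disallowed URLs are skipped instead of crawled.
    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)

    # Illustrative in-memory collection of skipped URLs for later analysis (not part of the patch).
    skipped_urls: list[tuple[str, SkippedReason]] = []

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    @crawler.on_skipped_request
    async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
        # Record every skipped URL together with the reason ('robots_txt' is the only reason for now).
        skipped_urls.append((url, reason))

    # As in the docs example, the login URL is disallowed by robots.txt and will be skipped.
    await crawler.run(
        ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login']
    )

    crawler.log.info(f'Collected {len(skipped_urls)} skipped URL(s) for further analysis.')


if __name__ == '__main__':
    asyncio.run(main())
```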