Merged
37 commits
427b00a  basic_robots_allow (Mantisus, Apr 17, 2025)
638b5be  add respect robots_txt_file (Mantisus, Apr 17, 2025)
33be1c8  update load (Mantisus, Apr 17, 2025)
a44dff1  change `RobotFileParser` to `Protego` (Mantisus, Apr 17, 2025)
538672e  add tests (Mantisus, Apr 17, 2025)
b9b35be  fix (Mantisus, Apr 17, 2025)
a49ab66  update tests (Mantisus, Apr 17, 2025)
46a2356  update TODO comments (Mantisus, Apr 17, 2025)
10077b6  update docstrings (Mantisus, Apr 17, 2025)
358b20c  fix docstrings (Mantisus, Apr 17, 2025)
5282cf9  Merge branch 'respect_robots_txt' into on-skipped-request (Mantisus, Apr 21, 2025)
dfa2087  add on_skipped_request (Mantisus, Apr 21, 2025)
b92494e  change staticmethod to classmethod (Mantisus, Apr 23, 2025)
9243cac  Update src/crawlee/_utils/robots.py (Mantisus, Apr 23, 2025)
c50eabe  add _robots_txt_locks_cache (Mantisus, Apr 23, 2025)
b6baca8  update `pyproject.toml` (Mantisus, Apr 23, 2025)
9c7ad1c  update docstrings (Mantisus, Apr 23, 2025)
c9e6147  add docs example (Mantisus, Apr 24, 2025)
13a4c9f  chore(deps): update typescript-eslint monorepo to v8.31.0 (#1168) (renovate[bot], Apr 22, 2025)
8db45af  chore(deps): update dependency setuptools to v79 (#1165) (renovate[bot], Apr 22, 2025)
00916f6  fix: Update `UnprocessedRequest` to match actual data (#1155) (Pijukatel, Apr 22, 2025)
ca866cc  chore(release): Update changelog and package version [skip ci] (Apr 22, 2025)
c5ab70d  fix: Fix the order in which cookies are saved to the `SessionCookies`… (Mantisus, Apr 23, 2025)
b960350  chore(release): Update changelog and package version [skip ci] (Apr 23, 2025)
9eca2d0  feat: Handle unprocessed requests in `add_requests_batched` (#1159) (Pijukatel, Apr 23, 2025)
cc12d1b  chore(release): Update changelog and package version [skip ci] (Apr 23, 2025)
1b7be37  fix: call `failed_request_handler` for `SessionError` when session ro… (Mantisus, Apr 23, 2025)
f67f9ca  chore(release): Update changelog and package version [skip ci] (Apr 23, 2025)
821f891  one lock to rule them all (Mantisus, Apr 24, 2025)
d3324f0  Merge branch 'master' into on-skipped-request (Mantisus, Apr 24, 2025)
fca60a2  fix (Mantisus, Apr 24, 2025)
35fa23f  add docs (Mantisus, Apr 25, 2025)
e85153a  add file prefix (Mantisus, Apr 28, 2025)
fdf400c  resolve (Mantisus, Apr 28, 2025)
45a1bc1  resolve (Mantisus, Apr 28, 2025)
ce76eb5  resolve (Mantisus, Apr 28, 2025)
493157f  remove filename (Mantisus, Apr 28, 2025)
36 changes: 36 additions & 0 deletions docs/examples/code_examples/respect_robots_on_skipped_request.py
@@ -0,0 +1,36 @@
import asyncio

from crawlee import SkippedReason
from crawlee.crawlers import (
BeautifulSoupCrawler,
BeautifulSoupCrawlingContext,
)


async def main() -> None:
# Initialize the crawler with robots.txt compliance enabled
crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)

@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
context.log.info(f'Processing {context.request.url} ...')

# highlight-start
# This handler is called when a request is skipped
@crawler.on_skipped_request
async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
# Check if the request was skipped due to robots.txt rules
if reason == 'robots_txt':
crawler.log.info(f'Skipped {url} due to robots.txt rules.')

# highlight-end

# Start the crawler with the specified URLs
# The login URL will be skipped and handled by the skipped_request_handler
await crawler.run(
['https://news.ycombinator.com/', 'https://news.ycombinator.com/login']
)


if __name__ == '__main__':
asyncio.run(main())
11 changes: 11 additions & 0 deletions docs/examples/respect_robots_txt_file.mdx
@@ -7,6 +7,7 @@ import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import RespectRobotsTxt from '!!raw-loader!roa-loader!./code_examples/respect_robots_txt_file.py';
import OnSkippedRequest from '!!raw-loader!roa-loader!./code_examples/respect_robots_on_skipped_request.py';

This example demonstrates how to configure your crawler to respect the rules established by websites for crawlers as described in the [robots.txt](https://www.robotstxt.org/robotstxt.html) file.

@@ -19,3 +20,13 @@ The code below demonstrates this behavior using the <ApiLink to="class/Beautiful
<RunnableCodeBlock className="language-python" language="python">
{RespectRobotsTxt}
</RunnableCodeBlock>

## Handle with `on_skipped_request`

If you want to process URLs that are skipped according to the `robots.txt` rules, for example for further analysis, use the `on_skipped_request` handler from <ApiLink to="class/BasicCrawler#on_skipped_request">`BasicCrawler`</ApiLink>.

Let's update the code by adding the `on_skipped_request` handler:

<RunnableCodeBlock className="language-python" language="python">
{OnSkippedRequest}
</RunnableCodeBlock>
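
For instance, the skipped URLs could be collected for later analysis, as the paragraph above suggests. Below is a minimal sketch built on the same API introduced in this PR (the `skipped_urls` list, the final log line, and the target URLs are illustrative, not part of the PR):

```python
import asyncio

from crawlee import SkippedReason
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    # Enable robots.txt compliance, as in the example above.
    crawler = BeautifulSoupCrawler(respect_robots_txt_file=True)
    skipped_urls: list[str] = []

    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    @crawler.on_skipped_request
    async def skipped_request_handler(url: str, reason: SkippedReason) -> None:
        # Store URLs that were skipped because of robots.txt rules for later analysis.
        if reason == 'robots_txt':
            skipped_urls.append(url)

    await crawler.run(
        ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login']
    )
    crawler.log.info(f'Collected {len(skipped_urls)} skipped URLs: {skipped_urls}')


if __name__ == '__main__':
    asyncio.run(main())
```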
3 changes: 2 additions & 1 deletion src/crawlee/__init__.py
@@ -2,7 +2,7 @@

from ._request import Request, RequestOptions
from ._service_locator import service_locator
from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction
from ._types import ConcurrencySettings, EnqueueStrategy, HttpHeaders, RequestTransformAction, SkippedReason
from ._utils.globs import Glob

__version__ = metadata.version('crawlee')
@@ -15,5 +15,6 @@
'Request',
'RequestOptions',
'RequestTransformAction',
'SkippedReason',
'service_locator',
]
2 changes: 2 additions & 0 deletions src/crawlee/_types.py
@@ -50,6 +50,8 @@
EnqueueStrategy: TypeAlias = Literal['all', 'same-domain', 'same-hostname', 'same-origin']
"""Enqueue strategy to be used for determining which links to extract and enqueue."""

SkippedReason: TypeAlias = Literal['robots_txt']


def _normalize_headers(headers: Mapping[str, str]) -> dict[str, str]:
"""Convert all header keys to lowercase, strips whitespace, and returns them sorted by key."""
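
Because `SkippedReason` is a plain `Literal` alias, user code can compare it directly against the string value. A tiny illustrative sketch (the helper function is hypothetical, not part of this PR):

```python
from crawlee import SkippedReason


def describe_skip(reason: SkippedReason) -> str:
    # 'robots_txt' is currently the only member of the Literal alias.
    if reason == 'robots_txt':
        return 'disallowed by the robots.txt file'
    return 'skipped for an unrecognized reason'


print(describe_skip('robots_txt'))  # -> disallowed by the robots.txt file
```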
11 changes: 9 additions & 2 deletions src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import asyncio
import logging
from abc import ABC
from typing import TYPE_CHECKING, Any, Callable, Generic, Union
@@ -157,6 +158,7 @@ async def extract_links(
kwargs.setdefault('strategy', 'same-hostname')

requests = list[Request]()
skipped = list[str]()
base_user_data = user_data or {}

robots_txt_file = await self._get_robots_txt_file_for_url(context.request.url)
@@ -168,8 +170,7 @@
url = convert_to_absolute_url(base_url, url)

if robots_txt_file and not robots_txt_file.is_allowed(url):
# TODO: https://github.com/apify/crawlee-python/issues/1160
# add processing with on_skipped_request hook
skipped.append(url)
continue

request_options = RequestOptions(url=url, user_data={**base_user_data}, label=label)
@@ -192,6 +193,12 @@
continue

requests.append(request)

if skipped:
skipped_tasks = [
asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
]
await asyncio.gather(*skipped_tasks)
return requests

return extract_links
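
The hunk above dispatches one `_handle_skipped_request` task per disallowed URL and awaits them together with `asyncio.gather`. The same dispatch pattern in isolation, as a runnable sketch with illustrative names standing in for the crawler internals:

```python
import asyncio


async def handle_skipped(url: str, reason: str) -> None:
    # Stand-in for the crawler's _handle_skipped_request coroutine.
    print(f'Skipped {url} ({reason})')


async def dispatch_skipped(skipped: list[str]) -> None:
    # One task per skipped URL, awaited together, mirroring the hunk above.
    tasks = [asyncio.create_task(handle_skipped(url, 'robots_txt')) for url in skipped]
    await asyncio.gather(*tasks)


asyncio.run(dispatch_skipped(['https://example.com/login', 'https://example.com/admin']))
```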
55 changes: 42 additions & 13 deletions src/crawlee/crawlers/_basic/_basic_crawler.py
@@ -33,6 +33,7 @@
HttpHeaders,
RequestHandlerRunResult,
SendRequestFunction,
SkippedReason,
)
from crawlee._utils.docs import docs_group
from crawlee._utils.robots import RobotsTxtFile
@@ -81,6 +82,7 @@
TStatisticsState = TypeVar('TStatisticsState', bound=StatisticsState, default=StatisticsState)
ErrorHandler = Callable[[TCrawlingContext, Exception], Awaitable[Union[Request, None]]]
FailedRequestHandler = Callable[[TCrawlingContext, Exception], Awaitable[None]]
SkippedRequestCallback = Callable[[str, SkippedReason], Awaitable[None]]


class _BasicCrawlerOptions(TypedDict):
@@ -335,9 +337,10 @@ def __init__(
self._router = None
self.router.default_handler(request_handler)

# Error & failed request handlers
# Error, failed & skipped request handlers
self._error_handler: ErrorHandler[TCrawlingContext | BasicCrawlingContext] | None = None
self._failed_request_handler: FailedRequestHandler[TCrawlingContext | BasicCrawlingContext] | None = None
self._on_skipped_request: SkippedRequestCallback | None = None
self._abort_on_error = abort_on_error

# Context of each request with matching result of request handler.
@@ -540,6 +543,14 @@ def failed_request_handler(
self._failed_request_handler = handler
return handler

def on_skipped_request(self, callback: SkippedRequestCallback) -> SkippedRequestCallback:
"""Register a function to handle skipped requests.

The skipped request handler is invoked when a request is skipped, for example because it is disallowed by the rules in the robots.txt file.
"""
self._on_skipped_request = callback
return callback

async def run(
self,
requests: Sequence[str | Request] | None = None,
@@ -676,8 +687,10 @@ async def add_requests(
skipped.append(request)

if skipped:
# TODO: https://github.com/apify/crawlee-python/issues/1160
# add processing with on_skipped_request hook
skipped_tasks = [
asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
]
await asyncio.gather(*skipped_tasks)
self._logger.warning('Some requests were skipped because they were disallowed based on the robots.txt file')

request_manager = await self.get_request_manager()
@@ -996,6 +1009,30 @@ async def _handle_failed_request(self, context: TCrawlingContext | BasicCrawling
except Exception as e:
raise UserDefinedErrorHandlerError('Exception thrown in user-defined failed request handler') from e

async def _handle_skipped_request(
self, request: Request | str, reason: SkippedReason, *, need_mark: bool = False
) -> None:
if need_mark and isinstance(request, Request):
request_manager = await self.get_request_manager()

await wait_for(
lambda: request_manager.mark_request_as_handled(request),
timeout=self._internal_timeout,
timeout_message='Marking request as handled timed out after '
f'{self._internal_timeout.total_seconds()} seconds',
logger=self._logger,
max_retries=3,
)
request.state = RequestState.SKIPPED

url = request.url if isinstance(request, Request) else request

if self._on_skipped_request:
try:
await self._on_skipped_request(url, reason)
except Exception as e:
raise UserDefinedErrorHandlerError('Exception thrown in user-defined skipped request callback') from e

def _get_message_from_error(self, error: Exception) -> str:
"""Get error message summary from exception.

@@ -1152,16 +1189,8 @@ async def __run_task_function(self) -> None:
self._logger.warning(
f'Skipping request {request.url} ({request.id}) because it is disallowed based on robots.txt'
)
await wait_for(
lambda: request_manager.mark_request_as_handled(request),
timeout=self._internal_timeout,
timeout_message='Marking request as handled timed out after '
f'{self._internal_timeout.total_seconds()} seconds',
logger=self._logger,
max_retries=3,
)
# TODO: https://github.com/apify/crawlee-python/issues/1160
# add processing with on_skipped_request hook

await self._handle_skipped_request(request, 'robots_txt', need_mark=True)
return

if request.session_id:
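
Since `on_skipped_request` returns the callback it receives, it can be used either as a decorator (as in the docs example) or as a plain registration call. A minimal sketch against the public `BasicCrawler` surface, assuming the same start URLs as the docs example (the handler bodies are illustrative):

```python
import asyncio

from crawlee import SkippedReason
from crawlee.crawlers import BasicCrawler, BasicCrawlingContext


async def log_skipped(url: str, reason: SkippedReason) -> None:
    # Exceptions raised here are re-raised as UserDefinedErrorHandlerError (see the hunk above).
    print(f'Skipped {url}: {reason}')


async def main() -> None:
    crawler = BasicCrawler(respect_robots_txt_file=True)

    @crawler.router.default_handler
    async def request_handler(context: BasicCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # Plain registration call instead of the decorator form.
    crawler.on_skipped_request(log_skipped)

    # The login URL is disallowed by robots.txt and ends up in log_skipped.
    await crawler.run(
        ['https://news.ycombinator.com/', 'https://news.ycombinator.com/login']
    )


if __name__ == '__main__':
    asyncio.run(main())
```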
11 changes: 9 additions & 2 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
@@ -1,5 +1,6 @@
from __future__ import annotations

import asyncio
import logging
from functools import partial
from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, Union
@@ -292,6 +293,7 @@ async def extract_links(
kwargs.setdefault('strategy', 'same-hostname')

requests = list[Request]()
skipped = list[str]()
base_user_data = user_data or {}

elements = await context.page.query_selector_all(selector)
@@ -309,8 +311,7 @@
url = convert_to_absolute_url(base_url, url)

if robots_txt_file and not robots_txt_file.is_allowed(url):
# TODO: https://github.com/apify/crawlee-python/issues/1160
# add processing with on_skipped_request hook
skipped.append(url)
continue

request_option = RequestOptions({'url': url, 'user_data': {**base_user_data}, 'label': label})
@@ -334,6 +335,12 @@

requests.append(request)

if skipped:
skipped_tasks = [
asyncio.create_task(self._handle_skipped_request(request, 'robots_txt')) for request in skipped
]
await asyncio.gather(*skipped_tasks)

return requests

return extract_links
25 changes: 24 additions & 1 deletion tests/unit/crawlers/_beautifulsoup/test_beautifulsoup_crawler.py
@@ -3,7 +3,7 @@
from typing import TYPE_CHECKING
from unittest import mock

from crawlee import ConcurrencySettings, HttpHeaders, RequestTransformAction
from crawlee import ConcurrencySettings, HttpHeaders, RequestTransformAction, SkippedReason
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext

if TYPE_CHECKING:
@@ -160,3 +160,26 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
str(server_url / 'start_enqueue'),
str(server_url / 'sub_index'),
}


async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
crawler = BeautifulSoupCrawler(http_client=http_client, respect_robots_txt_file=True)
skip = mock.Mock()

@crawler.router.default_handler
async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
await context.enqueue_links()

@crawler.on_skipped_request
async def skipped_hook(url: str, _reason: SkippedReason) -> None:
skip(url)

await crawler.run([str(server_url / 'start_enqueue')])

skipped = {call[0][0] for call in skip.call_args_list}

assert skipped == {
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
}
25 changes: 24 additions & 1 deletion tests/unit/crawlers/_parsel/test_parsel_crawler.py
@@ -6,7 +6,7 @@

import pytest

from crawlee import ConcurrencySettings, HttpHeaders, Request, RequestTransformAction
from crawlee import ConcurrencySettings, HttpHeaders, Request, RequestTransformAction, SkippedReason
from crawlee.crawlers import ParselCrawler

if TYPE_CHECKING:
@@ -256,3 +256,26 @@ async def request_handler(context: ParselCrawlingContext) -> None:
str(server_url / 'start_enqueue'),
str(server_url / 'sub_index'),
}


async def test_on_skipped_request(server_url: URL, http_client: HttpClient) -> None:
crawler = ParselCrawler(http_client=http_client, respect_robots_txt_file=True)
skip = mock.Mock()

@crawler.router.default_handler
async def request_handler(context: ParselCrawlingContext) -> None:
await context.enqueue_links()

@crawler.on_skipped_request
async def skipped_hook(url: str, _reason: SkippedReason) -> None:
skip(url)

await crawler.run([str(server_url / 'start_enqueue')])

skipped = {call[0][0] for call in skip.call_args_list}

assert skipped == {
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
}
25 changes: 24 additions & 1 deletion tests/unit/crawlers/_playwright/test_playwright_crawler.py
@@ -11,7 +11,7 @@

import pytest

from crawlee import ConcurrencySettings, HttpHeaders, Request, RequestTransformAction
from crawlee import ConcurrencySettings, HttpHeaders, Request, RequestTransformAction, SkippedReason
from crawlee.crawlers import PlaywrightCrawler
from crawlee.fingerprint_suite import (
DefaultFingerprintGenerator,
@@ -618,6 +618,29 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
}


async def test_on_skipped_request(server_url: URL) -> None:
crawler = PlaywrightCrawler(respect_robots_txt_file=True)
skip = mock.Mock()

@crawler.router.default_handler
async def request_handler(context: PlaywrightCrawlingContext) -> None:
await context.enqueue_links()

@crawler.on_skipped_request
async def skipped_hook(url: str, _reason: SkippedReason) -> None:
skip(url)

await crawler.run([str(server_url / 'start_enqueue')])

skipped = {call[0][0] for call in skip.call_args_list}

assert skipped == {
str(server_url / 'page_1'),
str(server_url / 'page_2'),
str(server_url / 'page_3'),
}


async def test_send_request(server_url: URL) -> None:
"""Check that the persist context works with fingerprints."""
check_data: dict[str, Any] = {}