From 6be3dc82e6f164591fa0f7ae48a1e325261e2106 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 25 Apr 2025 14:12:53 +0200 Subject: [PATCH 1/4] Add example guide for running in web server --- .../running_in_web_server/__init__.py | 0 .../running_in_web_server/crawler.py | 54 +++++++++++++++++ .../running_in_web_server/server.py | 58 +++++++++++++++++++ docs/guides/running_in_web_server.mdx | 45 ++++++++++++++ pyproject.toml | 14 ++++- 5 files changed, 169 insertions(+), 2 deletions(-) create mode 100644 docs/guides/code_examples/running_in_web_server/__init__.py create mode 100644 docs/guides/code_examples/running_in_web_server/crawler.py create mode 100644 docs/guides/code_examples/running_in_web_server/server.py create mode 100644 docs/guides/running_in_web_server.mdx diff --git a/docs/guides/code_examples/running_in_web_server/__init__.py b/docs/guides/code_examples/running_in_web_server/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/guides/code_examples/running_in_web_server/crawler.py b/docs/guides/code_examples/running_in_web_server/crawler.py new file mode 100644 index 0000000000..37c6671856 --- /dev/null +++ b/docs/guides/code_examples/running_in_web_server/crawler.py @@ -0,0 +1,54 @@ +import asyncio +from collections.abc import AsyncIterator +from contextlib import asynccontextmanager +from typing import TypedDict + +from fastapi import FastAPI + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + + +class State(TypedDict): + """State available in the app.""" + + crawler: ParselCrawler + requests_to_results: dict[str, asyncio.Future[dict[str, str]]] + + +@asynccontextmanager +async def lifespan(app: FastAPI) -> AsyncIterator[State]: + # Start up code that runs once when the app starts + + # Results will be stored in this dictionary + requests_to_results = dict[str, asyncio.Future[dict[str, str]]]() + + crawler = ParselCrawler( + # Keep the crawler alive even when there are no more requests to process now. + # This makes the crawler wait for more requests to be added later. + keep_alive=True + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + title = context.selector.xpath('//title/text()').get() or '' + + # Extract data from the page and save it to the result dictionary. 
+ requests_to_results[context.request.unique_key].set_result( + { + 'title': title, + } + ) + + # Start the crawler without awaiting it to finish + crawler.log.info(f'Starting crawler for the {app.title}') + run_task = asyncio.create_task(crawler.run([])) + + # Make the crawler and the result dictionary available in the app state + yield {'crawler': crawler, 'requests_to_results': requests_to_results} + + # Cleanup code that runs once when the app shuts down + crawler.stop() + # Wait for the crawler to finish + await run_task diff --git a/docs/guides/code_examples/running_in_web_server/server.py b/docs/guides/code_examples/running_in_web_server/server.py new file mode 100644 index 0000000000..64e192af37 --- /dev/null +++ b/docs/guides/code_examples/running_in_web_server/server.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import asyncio +from uuid import uuid4 + +from fastapi import FastAPI +from starlette.requests import Request +from starlette.responses import HTMLResponse + +import crawlee + +from .crawler import lifespan + +app = FastAPI(lifespan=lifespan, title='Crawler app') + + +@app.get('/', response_class=HTMLResponse) +def index() -> str: + return """ + + + +

+<html>
+    <head>
+        <title>Scraper server</title>
+    </head>
+    <body>
+        <h1>Scraper server</h1>
+        <p>To scrape some page, visit "scrape" endpoint with url parameter.
+        For example:
+        <a href="/scrape?url=https://www.example.com">
+            /scrape?url=https://www.example.com
+        </a>
+        </p>
+    </body>
+</html>
+ + +""" + + +@app.get('/scrape') +async def scrape_url(request: Request, url: str | None = None) -> dict: + if not url: + return {'url': 'missing', 'scrape result': 'no results'} + + # Generate random unique key for the request + unique_key = str(uuid4()) + + # Set the result future in the result dictionary so that it can be awaited + request.state.requests_to_results[unique_key] = asyncio.Future[dict[str, str]]() + + # Add the request to the crawler queue + await request.state.crawler.add_requests( + [crawlee.Request.from_url(url, unique_key=unique_key)] + ) + + # Wait for the result future to be finished + result = await request.state.requests_to_results[unique_key] + + # Clean the result from the result dictionary to free up memory + request.state.requests_to_results.pop(unique_key) + + # Return the result + return {'url': url, 'scrape result': result} diff --git a/docs/guides/running_in_web_server.mdx b/docs/guides/running_in_web_server.mdx new file mode 100644 index 0000000000..a9d415bda1 --- /dev/null +++ b/docs/guides/running_in_web_server.mdx @@ -0,0 +1,45 @@ +--- +id: running-in-web-server +title: Running in web server +description: Running in web server +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; + +import Crawler from '!!raw-loader!./code_examples/running_in_web_server/crawler.py'; +import Server from '!!raw-loader!./code_examples/running_in_web_server/server.py'; + +# Running in web server + +Most of the time, Crawlee jobs are run as batch jobs. You have a list of URLs you want to scrape every week or you might want to scrape a whole website once per day. After the scrape, you send the data to your warehouse for analytics. Batch jobs are efficient because they can use Crawlee's built-in autoscaling to fully utilize the resources you have available. But sometimes you have a use-case where you need to return scrape data as soon as possible. There might be a user waiting on the other end so every millisecond counts. This is where running Crawlee in a web server comes in. + +We will build a simple HTTP server that receives a page URL and returns the page title in the response. + +# Set up a web server + +There are many popular web server frameworks for Python, such as Flask, Django, Pyramid, ... In this guide, we will use the [FastAPI](https://fastapi.tiangolo.com/) to keep things simple. + +This will be our core server setup: + + + {Server} + + +The server has two endpoints. +- `/` - The index is just giving short description of the server with example link to the second endpoint. +- `/scrape` - This is the endpoint that receives a `url` parameter and returns the page title scraped from the URL + +To run the example server, make sure that you have installed the [fastapi[standard]](https://fastapi.tiangolo.com/#installation) and you can use the command `fastapi dev server.py` from the directory where the example code is located. + +# Create a crawler + +We will create a standard `ParselCrawler` and use the `keep_alive=true` option to keep the crawler running even if there are no requests currently in the `RequestQueue`. This way it will always be waiting for new requests to come in. + + + {Crawler} + + +Crawler is defined inside of [Lifespan](https://fastapi.tiangolo.com/advanced/events/#lifespan) which is a FastAPI way to run some start up/ teardown code for the app. 
There are two objects that we want to save to the app state so that they can be accessed in any endpoint through `request.state`: +- `crawler` holds instance of our crawler and allows the app to interact with it. +- `requests_to_results` is dictionary that is used to temporarily register expected results for each request and populate them when they are made available by the crawler. diff --git a/pyproject.toml b/pyproject.toml index 2e7c846fe8..c75084608b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -183,8 +183,9 @@ indent-style = "space" "**/docs/examples/code_examples/*crawler_with_error_snapshotter.py" = [ "PLR2004", # Magic value used in comparison. Ignored for simplicity and readability of example code. ] - - +"**/docs/guides/code_examples/running_in_web_server/server.py" = [ + "TC002", # uv false positive. Import actually needed during runtime. +] [tool.ruff.lint.flake8-quotes] docstring-quotes = "double" @@ -233,6 +234,8 @@ module = [ "apify", # Example code shows integration of apify and crawlee. "apify_fingerprint_datapoints", # Untyped and stubs not available "camoufox", # Example code shows integration of camoufox and crawlee. + "fastapi", # Example code shows running in webserver. + "starlette.*", # Example code shows running in webserver. "flask", # Example code shows deploy on Google Cloud. "functions_framework", # Example code shows deploy on Google Cloud. "jaro", # Untyped and stubs not available @@ -244,8 +247,15 @@ module = [ "cookiecutter.*", # Untyped and stubs not available "inquirer.*", # Untyped and stubs not available ] +disable_error_code = ["misc"] ignore_missing_imports = true +[[tool.mypy.overrides]] +module = [ + "running_in_web_server.*" # False positive when fastapi not available +] +disable_error_code = ["misc"] + [tool.basedpyright] pythonVersion = "3.9" typeCheckingMode = "standard" From 7181f0b5d880e1e168061042c8e4da293e1e5624 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 28 Apr 2025 09:22:45 +0200 Subject: [PATCH 2/4] Review comments --- docs/guides/running_in_web_server.mdx | 13 ++++++------- pyproject.toml | 1 - 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/docs/guides/running_in_web_server.mdx b/docs/guides/running_in_web_server.mdx index a9d415bda1..e41fb49163 100644 --- a/docs/guides/running_in_web_server.mdx +++ b/docs/guides/running_in_web_server.mdx @@ -10,19 +10,18 @@ import CodeBlock from '@theme/CodeBlock'; import Crawler from '!!raw-loader!./code_examples/running_in_web_server/crawler.py'; import Server from '!!raw-loader!./code_examples/running_in_web_server/server.py'; -# Running in web server Most of the time, Crawlee jobs are run as batch jobs. You have a list of URLs you want to scrape every week or you might want to scrape a whole website once per day. After the scrape, you send the data to your warehouse for analytics. Batch jobs are efficient because they can use Crawlee's built-in autoscaling to fully utilize the resources you have available. But sometimes you have a use-case where you need to return scrape data as soon as possible. There might be a user waiting on the other end so every millisecond counts. This is where running Crawlee in a web server comes in. We will build a simple HTTP server that receives a page URL and returns the page title in the response. -# Set up a web server +## Set up a web server -There are many popular web server frameworks for Python, such as Flask, Django, Pyramid, ... In this guide, we will use the [FastAPI](https://fastapi.tiangolo.com/) to keep things simple. 
+There are many popular web server frameworks for Python, such as [Flask](https://flask.palletsprojects.com/en/stable/), [Django](https://www.djangoproject.com/), [Pyramid](https://trypyramid.com/), ... In this guide, we will use the [FastAPI](https://fastapi.tiangolo.com/) to keep things simple. This will be our core server setup: - + {Server} @@ -30,13 +29,13 @@ The server has two endpoints. - `/` - The index is just giving short description of the server with example link to the second endpoint. - `/scrape` - This is the endpoint that receives a `url` parameter and returns the page title scraped from the URL -To run the example server, make sure that you have installed the [fastapi[standard]](https://fastapi.tiangolo.com/#installation) and you can use the command `fastapi dev server.py` from the directory where the example code is located. +To run the example server, make sure that you have installed the [fastapi[standard]](https://fastapi.tiangolo.com/#installation) and you can use the command ```fastapi dev server.py``` from the directory where the example code is located. -# Create a crawler +## Create a crawler We will create a standard `ParselCrawler` and use the `keep_alive=true` option to keep the crawler running even if there are no requests currently in the `RequestQueue`. This way it will always be waiting for new requests to come in. - + {Crawler} diff --git a/pyproject.toml b/pyproject.toml index c75084608b..2015f4c6e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -247,7 +247,6 @@ module = [ "cookiecutter.*", # Untyped and stubs not available "inquirer.*", # Untyped and stubs not available ] -disable_error_code = ["misc"] ignore_missing_imports = true [[tool.mypy.overrides]] From bdb193892b364e5e87356ec72dda0363eae124f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josef=20Proch=C3=A1zka?= Date: Mon, 28 Apr 2025 13:26:10 +0200 Subject: [PATCH 3/4] Update comment pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2015f4c6e2..fc8fdfe3ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -184,7 +184,7 @@ indent-style = "space" "PLR2004", # Magic value used in comparison. Ignored for simplicity and readability of example code. ] "**/docs/guides/code_examples/running_in_web_server/server.py" = [ - "TC002", # uv false positive. Import actually needed during runtime. + "TC002", # ruff false positive. Import actually needed during runtime. ] [tool.ruff.lint.flake8-quotes] From fc78092335c226268c2709501459bfc1729f36ec Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 29 Apr 2025 08:27:41 +0200 Subject: [PATCH 4/4] fastapi command on own line --- docs/guides/running_in_web_server.mdx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/guides/running_in_web_server.mdx b/docs/guides/running_in_web_server.mdx index e41fb49163..63f907e616 100644 --- a/docs/guides/running_in_web_server.mdx +++ b/docs/guides/running_in_web_server.mdx @@ -29,7 +29,10 @@ The server has two endpoints. - `/` - The index is just giving short description of the server with example link to the second endpoint. - `/scrape` - This is the endpoint that receives a `url` parameter and returns the page title scraped from the URL -To run the example server, make sure that you have installed the [fastapi[standard]](https://fastapi.tiangolo.com/#installation) and you can use the command ```fastapi dev server.py``` from the directory where the example code is located. 
+To run the example server, make sure that you have installed the [fastapi[standard]](https://fastapi.tiangolo.com/#installation) and from the directory where the example code is located you can use the following command: +``` +fastapi dev server.py +``` ## Create a crawler
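
Once the example server from this patch is running (with `fastapi dev server.py`, which by default serves on `http://127.0.0.1:8000`), the `/scrape` endpoint can be exercised with a small client. The sketch below is only an illustration and is not part of the patch; the host, port, and target URL are assumptions.

```python
import asyncio

import httpx


async def main() -> None:
    # Assumes the example server is running locally on the default `fastapi dev` port.
    async with httpx.AsyncClient(base_url='http://127.0.0.1:8000') as client:
        response = await client.get(
            '/scrape',
            params={'url': 'https://www.example.com'},
            # Scraping happens on demand, so allow a generous timeout.
            timeout=60,
        )
        response.raise_for_status()
        # Expected shape, based on server.py above:
        # {'url': 'https://www.example.com', 'scrape result': {'title': '...'}}
        print(response.json())


if __name__ == '__main__':
    asyncio.run(main())
```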
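The same app can also be driven in-process with FastAPI's `TestClient`; entering it as a context manager runs the lifespan defined in `crawler.py`, so the crawler is started before the request and stopped afterwards. This is a sketch under the assumption that the test module lives next to `server.py` in the same package and that the scraped site is reachable.

```python
from fastapi.testclient import TestClient

from .server import app


def test_scrape_returns_title() -> None:
    # Entering the context manager triggers the lifespan and starts the crawler.
    with TestClient(app) as client:
        response = client.get('/scrape', params={'url': 'https://www.example.com'})
        assert response.status_code == 200
        payload = response.json()
        assert payload['url'] == 'https://www.example.com'
        assert 'title' in payload['scrape result']
    # Exiting the context manager runs the shutdown code and stops the crawler.
```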