docs: Add guide for running crawler in web server #1174


Merged: 5 commits, Apr 29, 2025

54 changes: 54 additions & 0 deletions docs/guides/code_examples/running_in_web_server/crawler.py
@@ -0,0 +1,54 @@
import asyncio
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from typing import TypedDict

from fastapi import FastAPI

from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


Review comment (Collaborator): Not really important, but in FastAPI, you usually use dependencies for this kind of business.


class State(TypedDict):
    """State available in the app."""

    crawler: ParselCrawler
    requests_to_results: dict[str, asyncio.Future[dict[str, str]]]


@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[State]:
    # Startup code that runs once when the app starts.

    # Results will be stored in this dictionary.
    requests_to_results = dict[str, asyncio.Future[dict[str, str]]]()

    crawler = ParselCrawler(
        # Keep the crawler alive even when there are no more requests to process now.
        # This makes the crawler wait for more requests to be added later.
        keep_alive=True
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: ParselCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')
        title = context.selector.xpath('//title/text()').get() or ''

        # Extract data from the page and save it to the result dictionary.
        requests_to_results[context.request.unique_key].set_result(
            {
                'title': title,
            }
        )

    # Start the crawler without awaiting it to finish.
    crawler.log.info(f'Starting crawler for the {app.title}')
    run_task = asyncio.create_task(crawler.run([]))

    # Make the crawler and the result dictionary available in the app state.
    yield {'crawler': crawler, 'requests_to_results': requests_to_results}

    # Cleanup code that runs once when the app shuts down.
    crawler.stop()
    # Wait for the crawler to finish.
    await run_task
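
As the reviewer notes above, the crawler could also be exposed to endpoints through FastAPI dependencies instead of reading `request.state` directly. A minimal sketch of that alternative, not part of this PR (the `get_crawler` helper and the `/ping-crawler` endpoint are hypothetical):

```python
from fastapi import Depends, FastAPI, Request

from crawlee.crawlers import ParselCrawler

# In the real app this would be the FastAPI instance created with the lifespan above.
app = FastAPI()


def get_crawler(request: Request) -> ParselCrawler:
    # The lifespan state is reachable via request.state, so a dependency
    # can wrap that lookup once instead of repeating it in every endpoint.
    return request.state.crawler


@app.get('/ping-crawler')
async def ping_crawler(crawler: ParselCrawler = Depends(get_crawler)) -> dict:
    # Hypothetical endpoint that just confirms the dependency resolves.
    return {'crawler_alive': crawler is not None}
```
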
58 changes: 58 additions & 0 deletions docs/guides/code_examples/running_in_web_server/server.py
@@ -0,0 +1,58 @@
from __future__ import annotations

import asyncio
from uuid import uuid4

from fastapi import FastAPI
from starlette.requests import Request
from starlette.responses import HTMLResponse

import crawlee

from .crawler import lifespan

app = FastAPI(lifespan=lifespan, title='Crawler app')


@app.get('/', response_class=HTMLResponse)
def index() -> str:
    return """
    <!DOCTYPE html>
    <html>
        <body>
            <h1>Scraper server</h1>
            <p>To scrape a page, visit the "scrape" endpoint with a url parameter.
            For example:
                <a href="/scrape?url=https://www.example.com">
                    /scrape?url=https://www.example.com
                </a>
            </p>
        </body>
    </html>
    """


@app.get('/scrape')
async def scrape_url(request: Request, url: str | None = None) -> dict:
    if not url:
        return {'url': 'missing', 'scrape result': 'no results'}

    # Generate a random unique key for the request.
    unique_key = str(uuid4())

    # Register a result future in the result dictionary so that it can be awaited.
    request.state.requests_to_results[unique_key] = asyncio.Future[dict[str, str]]()

    # Add the request to the crawler queue.
    await request.state.crawler.add_requests(
        [crawlee.Request.from_url(url, unique_key=unique_key)]
    )

    # Wait for the result future to be finished.
    result = await request.state.requests_to_results[unique_key]

    # Remove the result from the result dictionary to free up memory.
    request.state.requests_to_results.pop(unique_key)

    # Return the result.
    return {'url': url, 'scrape result': result}
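
The endpoints above can also be exercised programmatically. For a quick end-to-end check, FastAPI's `TestClient` runs the lifespan (and therefore starts the crawler) when used as a context manager. A rough sketch, assuming the example directory is importable as a package (adjust the import path to your layout):

```python
from fastapi.testclient import TestClient

# Assumed import path; adjust it to wherever the example package lives.
from running_in_web_server.server import app

with TestClient(app) as client:
    # Entering the context manager triggers the lifespan, which starts the crawler.
    response = client.get('/scrape', params={'url': 'https://www.example.com'})
    print(response.json())
```
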
47 changes: 47 additions & 0 deletions docs/guides/running_in_web_server.mdx
@@ -0,0 +1,47 @@
---
id: running-in-web-server
title: Running in web server
description: Running in web server
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import Crawler from '!!raw-loader!./code_examples/running_in_web_server/crawler.py';
import Server from '!!raw-loader!./code_examples/running_in_web_server/server.py';


Most of the time, Crawlee jobs are run as batch jobs. You have a list of URLs you want to scrape every week, or you might want to scrape a whole website once per day. After the scrape, you send the data to your warehouse for analytics. Batch jobs are efficient because they can use Crawlee's built-in autoscaling to fully utilize the resources you have available. But sometimes you have a use case where you need to return scraped data as soon as possible. There might be a user waiting on the other end, so every millisecond counts. This is where running Crawlee in a web server comes in.

We will build a simple HTTP server that receives a page URL and returns the page title in the response.

## Set up a web server

There are many popular web server frameworks for Python, such as [Flask](https://flask.palletsprojects.com/en/stable/), [Django](https://www.djangoproject.com/) and [Pyramid](https://trypyramid.com/). In this guide, we will use [FastAPI](https://fastapi.tiangolo.com/) to keep things simple.

This will be our core server setup:

<CodeBlock className="language-python" title="server.py">
{Server}
</CodeBlock>

The server has two endpoints:
- `/` - The index just gives a short description of the server, with an example link to the second endpoint.
- `/scrape` - This endpoint receives a `url` parameter and returns the page title scraped from that URL.

To run the example server, make sure that you have [fastapi[standard]](https://fastapi.tiangolo.com/#installation) installed, and then run the following command from the directory where the example code is located:
```
fastapi dev server.py
```
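
Once the server is running, you can try the scrape endpoint from another terminal, for example with `curl` (the host and port below are the `fastapi dev` defaults):

```
curl "http://127.0.0.1:8000/scrape?url=https://www.example.com"
```

If everything is wired up, the response should be a JSON object along the lines of `{"url": "https://www.example.com", "scrape result": {"title": "Example Domain"}}`.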

## Create a crawler

We will create a standard <ApiLink to="class/ParselCrawler">`ParselCrawler`</ApiLink> and use the `keep_alive=True` option to keep the crawler running even if there are no requests currently in the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. This way it will always be waiting for new requests to come in.

<CodeBlock className="language-python" title="crawler.py">
{Crawler}
</CodeBlock>

The crawler is defined inside a [lifespan](https://fastapi.tiangolo.com/advanced/events/#lifespan) context manager, which is the FastAPI way to run startup and teardown code for the app. There are two objects that we want to save to the app state so that they can be accessed in any endpoint through `request.state`:
- `crawler` holds an instance of our crawler and allows the app to interact with it.
- `requests_to_results` is a dictionary used to temporarily register expected results for each request and to populate them once they are made available by the crawler.
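
The handshake between the endpoint and the crawler relies on plain `asyncio` futures. Stripped of the crawler specifics, the pattern looks roughly like this minimal sketch (the helper names are made up for illustration; the real code inlines this logic in the `/scrape` endpoint and the request handler):

```python
import asyncio

ResultFuture = asyncio.Future[dict[str, str]]


async def wait_for_result(results: dict[str, ResultFuture], key: str) -> dict[str, str]:
    # Endpoint side: register an empty future and suspend until it is resolved.
    results[key] = ResultFuture()
    result = await results[key]
    results.pop(key)  # Free the slot once the result has been consumed.
    return result


def publish_result(results: dict[str, ResultFuture], key: str, data: dict[str, str]) -> None:
    # Crawler side: resolving the future wakes up the waiting endpoint.
    results[key].set_result(data)
```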
13 changes: 11 additions & 2 deletions pyproject.toml
@@ -183,8 +183,9 @@ indent-style = "space"
"**/docs/examples/code_examples/*crawler_with_error_snapshotter.py" = [
"PLR2004", # Magic value used in comparison. Ignored for simplicity and readability of example code.
]


"**/docs/guides/code_examples/running_in_web_server/server.py" = [
"TC002", # ruff false positive. Import actually needed during runtime.
]

[tool.ruff.lint.flake8-quotes]
docstring-quotes = "double"
@@ -233,6 +234,8 @@ module = [
"apify", # Example code shows integration of apify and crawlee.
"apify_fingerprint_datapoints", # Untyped and stubs not available
"camoufox", # Example code shows integration of camoufox and crawlee.
"fastapi", # Example code shows running in webserver.
"starlette.*", # Example code shows running in webserver.
"flask", # Example code shows deploy on Google Cloud.
"functions_framework", # Example code shows deploy on Google Cloud.
"jaro", # Untyped and stubs not available
@@ -246,6 +249,12 @@
]
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = [
"running_in_web_server.*" # False positive when fastapi not available
]
disable_error_code = ["misc"]

[tool.basedpyright]
pythonVersion = "3.9"
typeCheckingMode = "standard"