From 6be3dc82e6f164591fa0f7ae48a1e325261e2106 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Fri, 25 Apr 2025 14:12:53 +0200 Subject: [PATCH 1/4] Add example guide for running in web server --- .../running_in_web_server/__init__.py | 0 .../running_in_web_server/crawler.py | 54 +++++++++++++++++ .../running_in_web_server/server.py | 58 +++++++++++++++++++ docs/guides/running_in_web_server.mdx | 45 ++++++++++++++ pyproject.toml | 14 ++++- 5 files changed, 169 insertions(+), 2 deletions(-) create mode 100644 docs/guides/code_examples/running_in_web_server/__init__.py create mode 100644 docs/guides/code_examples/running_in_web_server/crawler.py create mode 100644 docs/guides/code_examples/running_in_web_server/server.py create mode 100644 docs/guides/running_in_web_server.mdx diff --git a/docs/guides/code_examples/running_in_web_server/__init__.py b/docs/guides/code_examples/running_in_web_server/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/docs/guides/code_examples/running_in_web_server/crawler.py b/docs/guides/code_examples/running_in_web_server/crawler.py new file mode 100644 index 0000000000..37c6671856 --- /dev/null +++ b/docs/guides/code_examples/running_in_web_server/crawler.py @@ -0,0 +1,54 @@ +import asyncio +from collections.abc import AsyncIterator +from contextlib import asynccontextmanager +from typing import TypedDict + +from fastapi import FastAPI + +from crawlee.crawlers import ParselCrawler, ParselCrawlingContext + + +class State(TypedDict): + """State available in the app.""" + + crawler: ParselCrawler + requests_to_results: dict[str, asyncio.Future[dict[str, str]]] + + +@asynccontextmanager +async def lifespan(app: FastAPI) -> AsyncIterator[State]: + # Start up code that runs once when the app starts + + # Results will be stored in this dictionary + requests_to_results = dict[str, asyncio.Future[dict[str, str]]]() + + crawler = ParselCrawler( + # Keep the crawler alive even when there are no more requests to process now. + # This makes the crawler wait for more requests to be added later. + keep_alive=True + ) + + # Define the default request handler, which will be called for every request. + @crawler.router.default_handler + async def request_handler(context: ParselCrawlingContext) -> None: + context.log.info(f'Processing {context.request.url} ...') + title = context.selector.xpath('//title/text()').get() or '' + + # Extract data from the page and save it to the result dictionary. 
+ requests_to_results[context.request.unique_key].set_result( + { + 'title': title, + } + ) + + # Start the crawler without awaiting it to finish + crawler.log.info(f'Starting crawler for the {app.title}') + run_task = asyncio.create_task(crawler.run([])) + + # Make the crawler and the result dictionary available in the app state + yield {'crawler': crawler, 'requests_to_results': requests_to_results} + + # Cleanup code that runs once when the app shuts down + crawler.stop() + # Wait for the crawler to finish + await run_task diff --git a/docs/guides/code_examples/running_in_web_server/server.py b/docs/guides/code_examples/running_in_web_server/server.py new file mode 100644 index 0000000000..64e192af37 --- /dev/null +++ b/docs/guides/code_examples/running_in_web_server/server.py @@ -0,0 +1,58 @@ +from __future__ import annotations + +import asyncio +from uuid import uuid4 + +from fastapi import FastAPI +from starlette.requests import Request +from starlette.responses import HTMLResponse + +import crawlee + +from .crawler import lifespan + +app = FastAPI(lifespan=lifespan, title='Crawler app') + + +@app.get('/', response_class=HTMLResponse) +def index() -> str: + return """ + + + +

+<html>
+    <head>
+        <title>Scraper server</title>
+    </head>
+    <body>
+        <h1>Scraper server</h1>
+        <p>To scrape some page, visit "scrape" endpoint with url parameter.
+        For example:
+        <a href="/scrape?url=https://www.example.com">
+            /scrape?url=https://www.example.com
+        </a>
+        </p>
+    </body>
+</html>
+ + +""" + + +@app.get('/scrape') +async def scrape_url(request: Request, url: str | None = None) -> dict: + if not url: + return {'url': 'missing', 'scrape result': 'no results'} + + # Generate random unique key for the request + unique_key = str(uuid4()) + + # Set the result future in the result dictionary so that it can be awaited + request.state.requests_to_results[unique_key] = asyncio.Future[dict[str, str]]() + + # Add the request to the crawler queue + await request.state.crawler.add_requests( + [crawlee.Request.from_url(url, unique_key=unique_key)] + ) + + # Wait for the result future to be finished + result = await request.state.requests_to_results[unique_key] + + # Clean the result from the result dictionary to free up memory + request.state.requests_to_results.pop(unique_key) + + # Return the result + return {'url': url, 'scrape result': result} diff --git a/docs/guides/running_in_web_server.mdx b/docs/guides/running_in_web_server.mdx new file mode 100644 index 0000000000..a9d415bda1 --- /dev/null +++ b/docs/guides/running_in_web_server.mdx @@ -0,0 +1,45 @@ +--- +id: running-in-web-server +title: Running in web server +description: Running in web server +--- + +import ApiLink from '@site/src/components/ApiLink'; +import CodeBlock from '@theme/CodeBlock'; + +import Crawler from '!!raw-loader!./code_examples/running_in_web_server/crawler.py'; +import Server from '!!raw-loader!./code_examples/running_in_web_server/server.py'; + +# Running in web server + +Most of the time, Crawlee jobs are run as batch jobs. You have a list of URLs you want to scrape every week or you might want to scrape a whole website once per day. After the scrape, you send the data to your warehouse for analytics. Batch jobs are efficient because they can use Crawlee's built-in autoscaling to fully utilize the resources you have available. But sometimes you have a use-case where you need to return scrape data as soon as possible. There might be a user waiting on the other end so every millisecond counts. This is where running Crawlee in a web server comes in. + +We will build a simple HTTP server that receives a page URL and returns the page title in the response. + +# Set up a web server + +There are many popular web server frameworks for Python, such as Flask, Django, Pyramid, ... In this guide, we will use the [FastAPI](https://fastapi.tiangolo.com/) to keep things simple. + +This will be our core server setup: + + + {Server} + + +The server has two endpoints. +- `/` - The index is just giving short description of the server with example link to the second endpoint. +- `/scrape` - This is the endpoint that receives a `url` parameter and returns the page title scraped from the URL + +To run the example server, make sure that you have installed the [fastapi[standard]](https://fastapi.tiangolo.com/#installation) and you can use the command `fastapi dev server.py` from the directory where the example code is located. + +# Create a crawler + +We will create a standard `ParselCrawler` and use the `keep_alive=true` option to keep the crawler running even if there are no requests currently in the `RequestQueue`. This way it will always be waiting for new requests to come in. + + + {Crawler} + + +Crawler is defined inside of [Lifespan](https://fastapi.tiangolo.com/advanced/events/#lifespan) which is a FastAPI way to run some start up/ teardown code for the app. 
There are two objects that we want to save to the app state so that they can be accessed in any endpoint through `request.state`: +- `crawler` holds instance of our crawler and allows the app to interact with it. +- `requests_to_results` is dictionary that is used to temporarily register expected results for each request and populate them when they are made available by the crawler. diff --git a/pyproject.toml b/pyproject.toml index 2e7c846fe8..c75084608b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -183,8 +183,9 @@ indent-style = "space" "**/docs/examples/code_examples/*crawler_with_error_snapshotter.py" = [ "PLR2004", # Magic value used in comparison. Ignored for simplicity and readability of example code. ] - - +"**/docs/guides/code_examples/running_in_web_server/server.py" = [ + "TC002", # uv false positive. Import actually needed during runtime. +] [tool.ruff.lint.flake8-quotes] docstring-quotes = "double" @@ -233,6 +234,8 @@ module = [ "apify", # Example code shows integration of apify and crawlee. "apify_fingerprint_datapoints", # Untyped and stubs not available "camoufox", # Example code shows integration of camoufox and crawlee. + "fastapi", # Example code shows running in webserver. + "starlette.*", # Example code shows running in webserver. "flask", # Example code shows deploy on Google Cloud. "functions_framework", # Example code shows deploy on Google Cloud. "jaro", # Untyped and stubs not available @@ -244,8 +247,15 @@ module = [ "cookiecutter.*", # Untyped and stubs not available "inquirer.*", # Untyped and stubs not available ] +disable_error_code = ["misc"] ignore_missing_imports = true +[[tool.mypy.overrides]] +module = [ + "running_in_web_server.*" # False positive when fastapi not available +] +disable_error_code = ["misc"] + [tool.basedpyright] pythonVersion = "3.9" typeCheckingMode = "standard" From 7181f0b5d880e1e168061042c8e4da293e1e5624 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Mon, 28 Apr 2025 09:22:45 +0200 Subject: [PATCH 2/4] Review comments --- docs/guides/running_in_web_server.mdx | 13 ++++++------- pyproject.toml | 1 - 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/docs/guides/running_in_web_server.mdx b/docs/guides/running_in_web_server.mdx index a9d415bda1..e41fb49163 100644 --- a/docs/guides/running_in_web_server.mdx +++ b/docs/guides/running_in_web_server.mdx @@ -10,19 +10,18 @@ import CodeBlock from '@theme/CodeBlock'; import Crawler from '!!raw-loader!./code_examples/running_in_web_server/crawler.py'; import Server from '!!raw-loader!./code_examples/running_in_web_server/server.py'; -# Running in web server Most of the time, Crawlee jobs are run as batch jobs. You have a list of URLs you want to scrape every week or you might want to scrape a whole website once per day. After the scrape, you send the data to your warehouse for analytics. Batch jobs are efficient because they can use Crawlee's built-in autoscaling to fully utilize the resources you have available. But sometimes you have a use-case where you need to return scrape data as soon as possible. There might be a user waiting on the other end so every millisecond counts. This is where running Crawlee in a web server comes in. We will build a simple HTTP server that receives a page URL and returns the page title in the response. -# Set up a web server +## Set up a web server -There are many popular web server frameworks for Python, such as Flask, Django, Pyramid, ... In this guide, we will use the [FastAPI](https://fastapi.tiangolo.com/) to keep things simple. 
+There are many popular web server frameworks for Python, such as [Flask](https://flask.palletsprojects.com/en/stable/), [Django](https://www.djangoproject.com/), [Pyramid](https://trypyramid.com/), ... In this guide, we will use the [FastAPI](https://fastapi.tiangolo.com/) to keep things simple. This will be our core server setup: - + {Server} @@ -30,13 +29,13 @@ The server has two endpoints. - `/` - The index is just giving short description of the server with example link to the second endpoint. - `/scrape` - This is the endpoint that receives a `url` parameter and returns the page title scraped from the URL -To run the example server, make sure that you have installed the [fastapi[standard]](https://fastapi.tiangolo.com/#installation) and you can use the command `fastapi dev server.py` from the directory where the example code is located. +To run the example server, make sure that you have installed the [fastapi[standard]](https://fastapi.tiangolo.com/#installation) and you can use the command ```fastapi dev server.py``` from the directory where the example code is located. -# Create a crawler +## Create a crawler We will create a standard `ParselCrawler` and use the `keep_alive=true` option to keep the crawler running even if there are no requests currently in the `RequestQueue`. This way it will always be waiting for new requests to come in. - + {Crawler} diff --git a/pyproject.toml b/pyproject.toml index c75084608b..2015f4c6e2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -247,7 +247,6 @@ module = [ "cookiecutter.*", # Untyped and stubs not available "inquirer.*", # Untyped and stubs not available ] -disable_error_code = ["misc"] ignore_missing_imports = true [[tool.mypy.overrides]] From bdb193892b364e5e87356ec72dda0363eae124f3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Josef=20Proch=C3=A1zka?= Date: Mon, 28 Apr 2025 13:26:10 +0200 Subject: [PATCH 3/4] Update comment pyproject.toml --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 2015f4c6e2..fc8fdfe3ac 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -184,7 +184,7 @@ indent-style = "space" "PLR2004", # Magic value used in comparison. Ignored for simplicity and readability of example code. ] "**/docs/guides/code_examples/running_in_web_server/server.py" = [ - "TC002", # uv false positive. Import actually needed during runtime. + "TC002", # ruff false positive. Import actually needed during runtime. ] [tool.ruff.lint.flake8-quotes] From fc78092335c226268c2709501459bfc1729f36ec Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 29 Apr 2025 08:27:41 +0200 Subject: [PATCH 4/4] fastapi command on own line --- docs/guides/running_in_web_server.mdx | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/guides/running_in_web_server.mdx b/docs/guides/running_in_web_server.mdx index e41fb49163..63f907e616 100644 --- a/docs/guides/running_in_web_server.mdx +++ b/docs/guides/running_in_web_server.mdx @@ -29,7 +29,10 @@ The server has two endpoints. - `/` - The index is just giving short description of the server with example link to the second endpoint. - `/scrape` - This is the endpoint that receives a `url` parameter and returns the page title scraped from the URL -To run the example server, make sure that you have installed the [fastapi[standard]](https://fastapi.tiangolo.com/#installation) and you can use the command ```fastapi dev server.py``` from the directory where the example code is located. 
+To run the example server, make sure that you have installed the [fastapi[standard]](https://fastapi.tiangolo.com/#installation) and from the directory where the example code is located you can use the following command: +``` +fastapi dev server.py +``` ## Create a crawler
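
Once the example server from this patch is running (with `fastapi dev server.py`, which by default serves on `http://127.0.0.1:8000`), the `/scrape` endpoint can be exercised with a small client. The sketch below is only an illustration and is not part of the patch; the host, port, and target URL are assumptions.

```python
import asyncio

import httpx


async def main() -> None:
    # Assumes the example server is running locally on the default `fastapi dev` port.
    async with httpx.AsyncClient(base_url='http://127.0.0.1:8000') as client:
        response = await client.get(
            '/scrape',
            params={'url': 'https://www.example.com'},
            # Scraping happens on demand, so allow a generous timeout.
            timeout=60,
        )
        response.raise_for_status()
        # Expected shape, based on server.py above:
        # {'url': 'https://www.example.com', 'scrape result': {'title': '...'}}
        print(response.json())


if __name__ == '__main__':
    asyncio.run(main())
```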
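The same app can also be driven in-process with FastAPI's `TestClient`; entering it as a context manager runs the lifespan defined in `crawler.py`, so the crawler is started before the request and stopped afterwards. This is a sketch under the assumption that the test module lives next to `server.py` in the same package and that the scraped site is reachable.

```python
from fastapi.testclient import TestClient

from .server import app


def test_scrape_returns_title() -> None:
    # Entering the context manager triggers the lifespan and starts the crawler.
    with TestClient(app) as client:
        response = client.get('/scrape', params={'url': 'https://www.example.com'})
        assert response.status_code == 200
        payload = response.json()
        assert payload['url'] == 'https://www.example.com'
        assert 'title' in payload['scrape result']
    # Exiting the context manager runs the shutdown code and stops the crawler.
```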