From 7242fcadb096de8b756dad5b43160abf7a540d04 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Mon, 1 Dec 2025 09:33:00 +0100 Subject: [PATCH 1/2] feat: make Apify integration asynchronous --- engines/apify_api.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/engines/apify_api.py b/engines/apify_api.py index 1bcf6f9..35bb63c 100644 --- a/engines/apify_api.py +++ b/engines/apify_api.py @@ -1,8 +1,8 @@ import os try: - from apify_client import ApifyClient # type: ignore + from apify_client import ApifyClientAsync # type: ignore except Exception: # pragma: no cover - allow discovery without the dependency installed - ApifyClient = None # type: ignore[assignment] + ApifyClientAsync = None # type: ignore[assignment] from .base import Scraper, ScrapeResult from dotenv import load_dotenv import logging @@ -18,23 +18,23 @@ class ApifyAPIScraper(Scraper): """ def __init__(self): self.api_token = os.getenv("APIFY_API_TOKEN") - if ApifyClient is None: + if ApifyClientAsync is None: # Keep import-time lightweight so discovery works; fail when actually used raise RuntimeError("apify-client is not installed. Please `pip install apify-client`. ") if not self.api_token: raise RuntimeError("APIFY_API_TOKEN environment variable not set.") - self.client = ApifyClient(self.api_token) + self.client = ApifyClientAsync(self.api_token) self.actor_id = "apify/web-scraper" - def scrape(self, url: str, run_id: str) -> ScrapeResult: + async def scrape(self, url: str, run_id: str) -> ScrapeResult: error = None html = "" content_size = 0 - status_code = 500 + status_code = 500 try: # Start the actor and wait for it to finish actor_client = self.client.actor(self.actor_id) - run_result = actor_client.call( + run_result = await actor_client.call( run_input={ "startUrls": [{"url": url}], "maxRequestsPerCrawl": 1, @@ -68,7 +68,7 @@ def scrape(self, url: str, run_id: str) -> ScrapeResult: error = "No HTML found in Apify dataset result." except Exception as e: error = str(e) - + return ScrapeResult( run_id=run_id, scraper="apify_api", @@ -79,4 +79,4 @@ def scrape(self, url: str, run_id: str) -> ScrapeResult: format="html", created_at=datetime.now().isoformat(), content=html, - ) + ) From 5233a546e01b23a40bf893969e96098859e8d076 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jind=C5=99ich=20B=C3=A4r?= Date: Mon, 1 Dec 2025 09:50:54 +0100 Subject: [PATCH 2/2] fix: load dataset items asynchronously --- engines/apify_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/engines/apify_api.py b/engines/apify_api.py index 35bb63c..e7c1a03 100644 --- a/engines/apify_api.py +++ b/engines/apify_api.py @@ -59,7 +59,7 @@ async def scrape(self, url: str, run_id: str) -> ScrapeResult: else: dataset_id = run_result["defaultDatasetId"] dataset_client = self.client.dataset(dataset_id) - items = dataset_client.list_items().items + items = (await dataset_client.list_items()).items if items and "html" in items[0]: html = items[0]["html"] or "" status_code = items[0].get("status_code")