20 changes: 10 additions & 10 deletions engines/apify_api.py
@@ -1,8 +1,8 @@
 import os
 try:
-    from apify_client import ApifyClient  # type: ignore
+    from apify_client import ApifyClientAsync  # type: ignore
 except Exception:  # pragma: no cover - allow discovery without the dependency installed
-    ApifyClient = None  # type: ignore[assignment]
+    ApifyClientAsync = None  # type: ignore[assignment]
 from .base import Scraper, ScrapeResult
 from dotenv import load_dotenv
 import logging
@@ -18,23 +18,23 @@ class ApifyAPIScraper(Scraper):
     """
     def __init__(self):
         self.api_token = os.getenv("APIFY_API_TOKEN")
-        if ApifyClient is None:
+        if ApifyClientAsync is None:
             # Keep import-time lightweight so discovery works; fail when actually used
             raise RuntimeError("apify-client is not installed. Please `pip install apify-client`. ")
         if not self.api_token:
             raise RuntimeError("APIFY_API_TOKEN environment variable not set.")
-        self.client = ApifyClient(self.api_token)
+        self.client = ApifyClientAsync(self.api_token)
         self.actor_id = "apify/web-scraper"
 
-    def scrape(self, url: str, run_id: str) -> ScrapeResult:
+    async def scrape(self, url: str, run_id: str) -> ScrapeResult:
         error = None
         html = ""
         content_size = 0
-        status_code = 500
+        status_code = 500
         try:
             # Start the actor and wait for it to finish
             actor_client = self.client.actor(self.actor_id)
-            run_result = actor_client.call(
+            run_result = await actor_client.call(
                 run_input={
                     "startUrls": [{"url": url}],
                     "maxRequestsPerCrawl": 1,
@@ -59,7 +59,7 @@ def scrape(self, url: str, run_id: str) -> ScrapeResult:
             else:
                 dataset_id = run_result["defaultDatasetId"]
                 dataset_client = self.client.dataset(dataset_id)
-                items = dataset_client.list_items().items
+                items = (await dataset_client.list_items()).items
                 if items and "html" in items[0]:
                     html = items[0]["html"] or ""
                     status_code = items[0].get("status_code")
@@ -68,7 +68,7 @@ def scrape(self, url: str, run_id: str) -> ScrapeResult:
                     error = "No HTML found in Apify dataset result."
         except Exception as e:
             error = str(e)
-
+
         return ScrapeResult(
             run_id=run_id,
             scraper="apify_api",
@@ -79,4 +79,4 @@ def scrape(self, url: str, run_id: str) -> ScrapeResult:
             format="html",
             created_at=datetime.now().isoformat(),
             content=html,
-        )
+        )
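Since `scrape` is now a coroutine, every caller has to await it. A minimal usage sketch, assuming the module import path `engines.apify_api` implied by the diff and a valid `APIFY_API_TOKEN` in the environment; the URL and `run_id` below are illustrative placeholders:

```python
import asyncio

from engines.apify_api import ApifyAPIScraper

async def main() -> None:
    # Constructing the scraper still raises at use time if apify-client
    # is missing or APIFY_API_TOKEN is unset, exactly as before.
    scraper = ApifyAPIScraper()
    # scrape() is now a coroutine: calling it without await would just
    # return an un-run coroutine object, not a ScrapeResult.
    result = await scraper.scrape("https://example.com", run_id="run-001")
    print(result.status_code, len(result.content))

if __name__ == "__main__":
    asyncio.run(main())
```

One payoff of the switch to `ApifyClientAsync` is that the actor call no longer blocks a thread, so multiple URLs can be scraped concurrently, e.g. by wrapping several `scraper.scrape(...)` coroutines in `asyncio.gather`.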