
feat: Add Cheerio content crawler #52

Merged 13 commits on Mar 7, 2025
Changes from all commits
3 changes: 3 additions & 0 deletions .actor/Dockerfile
@@ -58,5 +58,8 @@ COPY --from=builder --chown=myuser /home/myuser/dist ./dist
# for most source file changes.
COPY --chown=myuser . ./

# Disable experimental feature warning from Node.js
ENV NODE_NO_WARNINGS=1

# Run the image.
CMD npm run start:prod --silent
9 changes: 9 additions & 0 deletions .actor/input_schema.json
@@ -136,6 +136,15 @@
"description": "If enabled, the Actor attempts to close or remove cookie consent dialogs to improve the quality of extracted text. Note that this setting increases the latency.",
"default": true
},
"scrapingTool": {
"title": "Which scraping tool to use",
"type": "string",
"description": "Choose what scraping tool to use for extracting the target web pages. The Browser tool is more powerful and can handle JavaScript heavy websites. While the Plain HTML tool is about two times faster.",
"editor": "select",
"default": "browser-playwright",
"enum": ["browser-playwright", "raw-http"],
"enumTitles": ["Browser (uses Playwright)", "Raw HTTP"]
},
"debugMode": {
"title": "Enable debug mode",
"type": "boolean",
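For context, here is a minimal sketch of an Actor input that exercises the new field. The surrounding `ActorInput` interface and the `query` field are assumptions for illustration; only `scrapingTool` and its two values come from the schema above.

// Illustrative only: the enum values match the input schema; everything else is assumed.
interface ActorInput {
    query: string;
    scrapingTool: 'browser-playwright' | 'raw-http';
}

const input: ActorInput = {
    query: 'web scraping frameworks',
    scrapingTool: 'raw-http', // trades JavaScript rendering for roughly 2x faster fetches
};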
1 change: 1 addition & 0 deletions README.md
@@ -116,6 +116,7 @@ The `/search` GET HTTP endpoint accepts the following query parameters:
| `dynamicContentWaitSecs` | number | `10` | The maximum time in seconds to wait for dynamic page content to load. The Actor considers the web page as fully loaded once this time elapses or when the network becomes idle. |
| `removeCookieWarnings` | boolean | `true` | If enabled, removes cookie consent dialogs to improve text extraction accuracy. This might increase latency. |
| `removeElementsCssSelector` | string | `see input` | A CSS selector matching HTML elements that will be removed from the DOM before converting it to text, Markdown, or saving as HTML. This is useful to skip irrelevant page content. The value must be a valid CSS selector as accepted by the `document.querySelectorAll()` function. \n\nBy default, the Actor removes common navigation elements, headers, footers, modals, scripts, and inline images. You can disable the removal by setting this value to some non-existent CSS selector like `dummy_keep_everything`. |
| `scrapingTool` | string | `browser-playwright` | Selects which scraping tool is used to extract the target websites. `browser-playwright` uses a browser and can handle complex, JavaScript-heavy websites, while `raw-http` uses plain HTTP requests to fetch the HTML of the URL; it can't handle websites that rely on JavaScript, but it's about two times faster (see the example request below). |
| `debugMode` | boolean | `false` | If enabled, the Actor will store debugging information in the dataset's debug field. |

<!-- TODO: we should probably add proxyConfiguration -->
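For the `/search` endpoint documented above, a request using the new parameter might look like the following sketch. The host is a placeholder, not the Actor's real address; only the parameter names come from the table.

// Hypothetical call to the /search endpoint; requires Node.js 18+ for global fetch.
const params = new URLSearchParams({
    query: 'apify',
    scrapingTool: 'raw-http', // the default is 'browser-playwright'
});
const response = await fetch(`https://<standby-actor-url>/search?${params}`);
const results = await response.json();
console.log(results);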
8 changes: 7 additions & 1 deletion src/const.ts
@@ -1,4 +1,4 @@
import inputSchema from '../.actor/input_schema.json' assert { type: 'json' };
import inputSchema from '../.actor/input_schema.json' with { type: 'json' };

Review comment (Member):

One more thing. I see this in the log: `(node:21) ExperimentalWarning: Importing JSON modules is an experimental feature and might change at any time`

Can you try to suppress this warning? If that doesn't work, try increasing the Node.js version (but that could break other things, so let's try suppressing first).
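For reference, the `ENV NODE_NO_WARNINGS=1` added to the Dockerfile earlier in this diff is the global way to silence it. A programmatic alternative is sketched below; it relies on the default warning printer being an ordinary 'warning' listener, which holds in recent Node.js versions but is not a documented guarantee. This is not the approach the PR takes.

// Sketch: drop only the JSON-modules ExperimentalWarning, re-emit everything else.
process.removeAllListeners('warning');
process.on('warning', (warning) => {
    if (warning.name === 'ExperimentalWarning' && warning.message.includes('JSON modules')) return;
    console.warn(`${warning.name}: ${warning.message}`);
});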

export enum ContentCrawlerStatus {
PENDING = 'pending',
@@ -12,6 +12,11 @@ export enum Routes {
MESSAGE = '/message',
}

export enum ContentCrawlerTypes {
PLAYWRIGHT = 'playwright',
CHEERIO = 'cheerio',
}

export const PLAYWRIGHT_REQUEST_TIMEOUT_NORMAL_MODE_SECS = 60;

// Default values parsed from input_schema.json
@@ -38,4 +43,5 @@ export const defaults = {
serpMaxRetries: inputSchema.properties.serpMaxRetries.default,
serpMaxRetriesMax: inputSchema.properties.serpMaxRetries.maximum,
serpProxyGroup: inputSchema.properties.serpProxyGroup.default,
scrapingTool: inputSchema.properties.scrapingTool.default,
};
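The new `scrapingTool` input value has to be mapped to one of these crawler types somewhere upstream. That conversion is not part of this diff; a hypothetical version of it could look like this:

import { ContentCrawlerTypes } from './const.js';

// Hypothetical helper: translate the input_schema value into the crawler type enum.
function crawlerTypeFromInput(scrapingTool: string): ContentCrawlerTypes {
    return scrapingTool === 'raw-http'
        ? ContentCrawlerTypes.CHEERIO
        : ContentCrawlerTypes.PLAYWRIGHT;
}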
119 changes: 65 additions & 54 deletions src/crawlers.ts
@@ -12,10 +12,11 @@ import {
RequestOptions,
} from 'crawlee';

import { ContentCrawlerTypes } from './const.js';
import { scrapeOrganicResults } from './google-search/google-extractors-urls.js';
import { failedRequestHandlerPlaywright, requestHandlerPlaywright } from './playwright-req-handler.js';
import { failedRequestHandler, requestHandlerCheerio, requestHandlerPlaywright } from './request-handler.js';
import { addEmptyResultToResponse, sendResponseError } from './responses.js';
import type { PlaywrightCrawlerUserData, SearchCrawlerUserData } from './types.js';
import type { ContentCrawlerOptions, ContentCrawlerUserData, SearchCrawlerUserData } from './types.js';
import { addTimeMeasureEvent, createRequest } from './utils.js';

const crawlers = new Map<string, CheerioCrawler | PlaywrightCrawler>();
@@ -25,42 +26,22 @@ export function getCrawlerKey(crawlerOptions: CheerioCrawlerOptions | PlaywrightCrawlerOptions) {
return JSON.stringify(crawlerOptions);
}

/**
* Creates and starts a Google search crawler and Playwright content crawler with the provided configurations.
* A crawler won't be created if it already exists.
*/
export async function createAndStartCrawlers(
cheerioCrawlerOptions: CheerioCrawlerOptions,
playwrightCrawlerOptions: PlaywrightCrawlerOptions,
startCrawlers: boolean = true,
) {
const { crawler: searchCrawler } = await createAndStartSearchCrawler(
cheerioCrawlerOptions,
startCrawlers,
);
const { key: playwrightCrawlerKey, crawler: playwrightCrawler } = await createAndStartCrawlerPlaywright(
playwrightCrawlerOptions,
startCrawlers,
);
return { searchCrawler, playwrightCrawler, playwrightCrawlerKey };
}

/**
* Creates and starts a Google search crawler with the provided configuration.
* A crawler won't be created if it already exists.
*/
async function createAndStartSearchCrawler(
cheerioCrawlerOptions: CheerioCrawlerOptions,
export async function createAndStartSearchCrawler(
searchCrawlerOptions: CheerioCrawlerOptions,
startCrawler: boolean = true,
) {
const key = getCrawlerKey(cheerioCrawlerOptions);
const key = getCrawlerKey(searchCrawlerOptions);
if (crawlers.has(key)) {
return { key, crawler: crawlers.get(key) };
}

log.info(`Creating new cheerio crawler with key ${key}`);
const crawler = new CheerioCrawler({
...(cheerioCrawlerOptions as CheerioCrawlerOptions),
...(searchCrawlerOptions as CheerioCrawlerOptions),
requestQueue: await RequestQueue.open(key, { storageClient: client }),
requestHandler: async ({ request, $: _$ }: CheerioCrawlingContext<SearchCrawlerUserData>) => {
// NOTE: we need to cast this to fix `cheerio` type errors
@@ -92,10 +73,10 @@ async function createAndStartSearchCrawler(
request.userData.query,
result,
responseId,
request.userData.playwrightScraperSettings!,
request.userData.contentScraperSettings!,
request.userData.timeMeasures!,
);
await addPlaywrightCrawlRequest(r, responseId, request.userData.playwrightCrawlerKey!);
await addContentCrawlRequest(r, responseId, request.userData.contentCrawlerKey!);
}
},
failedRequestHandler: async ({ request }, err) => {
@@ -118,50 +99,78 @@
}

/**
* Creates and starts a Playwright content crawler with the provided configuration.
* Creates and starts a content crawler with the provided configuration.
* Either Playwright or Cheerio crawler will be created based on the provided crawler options.
* A crawler won't be created if it already exists.
*/
async function createAndStartCrawlerPlaywright(
crawlerOptions: PlaywrightCrawlerOptions,
export async function createAndStartContentCrawler(
contentCrawlerOptions: ContentCrawlerOptions,
startCrawler: boolean = true,
) {
const { type: crawlerType, crawlerOptions } = contentCrawlerOptions;

const key = getCrawlerKey(crawlerOptions);
if (crawlers.has(key)) {
return { key, crawler: crawlers.get(key) };
}

log.info(`Creating new playwright crawler with key ${key}`);
const crawler = new PlaywrightCrawler({
...(crawlerOptions as PlaywrightCrawlerOptions),
keepAlive: crawlerOptions.keepAlive,
requestQueue: await RequestQueue.open(key, { storageClient: client }),
requestHandler: async (context: PlaywrightCrawlingContext) => {
await requestHandlerPlaywright(context as unknown as PlaywrightCrawlingContext<PlaywrightCrawlerUserData>);
},
failedRequestHandler: ({ request }, err) => failedRequestHandlerPlaywright(request, err),
});
const crawler = crawlerType === 'playwright'
? await createPlaywrightContentCrawler(crawlerOptions, key)
: await createCheerioContentCrawler(crawlerOptions, key);

if (startCrawler) {
crawler.run().then(
() => log.warning(`Crawler playwright has finished`),
() => log.warning(`Crawler ${crawlerType} has finished`),
() => {},
);
log.info('Crawler playwright has started 💪🏼');
log.info(`Crawler ${crawlerType} has started 💪🏼`);
}
crawlers.set(key, crawler);
log.info(`Number of crawlers ${crawlers.size}`);
return { key, crawler };
}

async function createPlaywrightContentCrawler(
crawlerOptions: PlaywrightCrawlerOptions,
key: string,
): Promise<PlaywrightCrawler> {
log.info(`Creating new playwright crawler with key ${key}`);
return new PlaywrightCrawler({
...crawlerOptions,
keepAlive: crawlerOptions.keepAlive,
requestQueue: await RequestQueue.open(key, { storageClient: client }),
requestHandler: async (context) => {
await requestHandlerPlaywright(context as unknown as PlaywrightCrawlingContext<ContentCrawlerUserData>);
},
failedRequestHandler: ({ request }, err) => failedRequestHandler(request, err, ContentCrawlerTypes.PLAYWRIGHT),
});
}

async function createCheerioContentCrawler(
crawlerOptions: CheerioCrawlerOptions,
key: string,
): Promise<CheerioCrawler> {
log.info(`Creating new cheerio crawler with key ${key}`);
return new CheerioCrawler({
...crawlerOptions,
keepAlive: crawlerOptions.keepAlive,
requestQueue: await RequestQueue.open(key, { storageClient: client }),
requestHandler: async (context) => {
await requestHandlerCheerio(context as unknown as CheerioCrawlingContext<ContentCrawlerUserData>);
},
failedRequestHandler: ({ request }, err) => failedRequestHandler(request, err, ContentCrawlerTypes.CHEERIO),
});
}

/**
* Adds a search request to the Google search crawler.
* Create a response for the request and set the desired number of results (maxResults).
*/
export const addSearchRequest = async (
request: RequestOptions<PlaywrightCrawlerUserData>,
cheerioCrawlerOptions: CheerioCrawlerOptions,
request: RequestOptions<ContentCrawlerUserData>,
searchCrawlerOptions: CheerioCrawlerOptions,
) => {
const key = getCrawlerKey(cheerioCrawlerOptions);
const key = getCrawlerKey(searchCrawlerOptions);
const crawler = crawlers.get(key);

if (!crawler) {
@@ -174,26 +183,28 @@ export const addSearchRequest = async (
};

/**
* Adds a content crawl request to the Playwright content crawler.
* Adds a content crawl request to selected content crawler.
* Get existing crawler based on crawlerOptions and scraperSettings, if not present -> create new
*/
export const addPlaywrightCrawlRequest = async (
request: RequestOptions<PlaywrightCrawlerUserData>,
export const addContentCrawlRequest = async (
Review comment (Collaborator):

Thanks! contentCrawler is indeed a way better name.

request: RequestOptions<ContentCrawlerUserData>,
responseId: string,
playwrightCrawlerKey: string,
contentCrawlerKey: string,
) => {
const crawler = crawlers.get(playwrightCrawlerKey);
const crawler = crawlers.get(contentCrawlerKey);
const name = crawler instanceof PlaywrightCrawler ? 'playwright' : 'cheerio';

if (!crawler) {
log.error(`Playwright crawler not found: key ${playwrightCrawlerKey}`);
log.error(`Content crawler not found: key ${contentCrawlerKey}`);
return;
}
try {
await crawler.requestQueue!.addRequest(request);
// create an empty result in search request response
// do not use request.uniqueKey as responseId as it is not id of a search request
addEmptyResultToResponse(responseId, request);
log.info(`Added request to the playwright-content-crawler: ${request.url}`);
log.info(`Added request to the ${name}-content-crawler: ${request.url}`);
} catch (err) {
log.error(`Error adding request to playwright-content-crawler: ${request.url}, error: ${err}`);
log.error(`Error adding request to ${name}-content-crawler: ${request.url}, error: ${err}`);
}
};
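Taken together, a caller might wire these functions up roughly as follows. This is a sketch under assumptions: the crawler options, request fields, and response ID are illustrative placeholders, not values from this PR.

import { ContentCrawlerTypes } from './const.js';
import { addContentCrawlRequest, createAndStartContentCrawler } from './crawlers.js';

// Start (or reuse) a Cheerio-based content crawler, then enqueue a page for extraction.
const { key: contentCrawlerKey } = await createAndStartContentCrawler({
    type: ContentCrawlerTypes.CHEERIO,
    crawlerOptions: { keepAlive: true }, // placeholder options
});

await addContentCrawlRequest(
    { url: 'https://example.com' }, // request fields are illustrative
    'response-1', // hypothetical response ID created when the search request was added
    contentCrawlerKey,
);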