feat: performance evaluation #61

Status: Open. Wants to merge 8 commits into base branch `feat/block-media`. Showing changes from all commits.
24 changes: 16 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -281,23 +281,31 @@ increase `requestTimeoutSecs` accordingly.

Below is a typical latency breakdown for RAG Web Browser with **maxResults** set to either `1` or `3`, and various memory settings.
These settings allow for processing all search results in parallel.
The numbers below are based on the following search terms: "apify", "Donald Trump", "boston".
The numbers below are based on the following search terms: "apify", "Donald Trump", "AI Agents".
Results were averaged for the three queries.

| Memory (GB) | Max results | Latency (sec) |
|-------------|-------------|---------------|
| 4 | 1 | 22 |
| 4 | 3 | 31 |
| 8 | 1 | 16 |
| 8 | 3 | 17 |
| Memory (GB) | Scraping Tool | Max Results | Latency (sec) |
|-------------|--------------------|-------------|---------------|
| 8 | raw-http | 1 | 3.4 |
| 8 | browser-playwright | 1 | 8.9 |
| 8 | raw-http | 3 | 5.4 |
| 8 | browser-playwright | 3 | 13.6 |

Please note the these results are only indicative and may vary based on the search term, target websites, and network latency.
| Memory (GB) | Scraping Tool | Max Results | Latency (sec) |
|-------------|--------------------|-------------|---------------|
| 4 | raw-http | 1 | 4.1 |
| 4 | raw-http | 3 | 4.8 |
| 4 | browser-playwright | 1 | 16.5 |
| 4 | browser-playwright | 3 | 20.6 |

Please note that these results are only indicative and may vary based on the search term, target websites, and network latency.
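For reference, the requests used in these measurements can be assembled with the Actor's input parameters (`query`, `maxResults`, `scrapingTool`). A minimal sketch of the URL construction, with an illustrative standby host:

```typescript
// Build the query string for a single RAG Web Browser standby request.
// The parameter names match the Actor's input schema; the host below is
// illustrative, not the benchmark's actual deployment.
function buildSearchUrl(base: string, query: string, maxResults: number, scrapingTool: string): string {
    const params = new URLSearchParams({
        query,
        maxResults: maxResults.toString(),
        scrapingTool,
    });
    return `${base}/search?${params.toString()}`;
}

const url = buildSearchUrl('https://rag-web-browser.apify.actor', 'apify', 3, 'raw-http');
console.log(url);
// https://rag-web-browser.apify.actor/search?query=apify&maxResults=3&scrapingTool=raw-http
```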

## 💰 Pricing

The RAG Web Browser is free of charge, and you only pay for the Apify platform consumption when it runs.
The main driver of the price is the Actor compute units (CUs), which are proportional to the amount of Actor run memory
and run time (1 CU = 1 GB memory x 1 hour).
Another thing to consider is proxy traffic; residential proxies are more expensive than datacenter proxies.
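The CU formula above can be made concrete with a small helper. A sketch (the function name is hypothetical; check the Apify pricing page for the actual per-CU rate):

```typescript
// Compute Apify compute units: 1 CU = 1 GB of memory x 1 hour of run time.
function computeUnits(memoryGb: number, runSeconds: number): number {
    return memoryGb * (runSeconds / 3600);
}

// Example: an 8 GB run that takes 18 seconds consumes 0.04 CU.
console.log(computeUnits(8, 18)); // 0.04
```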

## ⓘ Limitations and feedback

4 changes: 2 additions & 2 deletions src/const.ts
@@ -21,6 +21,7 @@ export const PLAYWRIGHT_REQUEST_TIMEOUT_NORMAL_MODE_SECS = 60;

// Default values parsed from input_schema.json
export const defaults = {
blockMedia: inputSchema.properties.blockMedia.default,
debugMode: inputSchema.properties.debugMode.default,
dynamicContentWaitSecs: inputSchema.properties.dynamicContentWaitSecs.default,
htmlTransformer: inputSchema.properties.htmlTransformer.default,
@@ -37,12 +38,11 @@ export const defaults = {
query: undefined, // No default value in input_schema.json
readableTextCharThreshold: 100, // Not in input_schema.json
removeCookieWarnings: inputSchema.properties.removeCookieWarnings.default,
blockMedia: inputSchema.properties.blockMedia.default,
removeElementsCssSelector: inputSchema.properties.removeElementsCssSelector.default,
requestTimeoutSecs: inputSchema.properties.requestTimeoutSecs.default,
requestTimeoutSecsMax: inputSchema.properties.requestTimeoutSecs.maximum,
scrapingTool: inputSchema.properties.scrapingTool.default,
serpMaxRetries: inputSchema.properties.serpMaxRetries.default,
serpMaxRetriesMax: inputSchema.properties.serpMaxRetries.maximum,
serpProxyGroup: inputSchema.properties.serpProxyGroup.default,
scrapingTool: inputSchema.properties.scrapingTool.default,
};
31 changes: 31 additions & 0 deletions src/crawlers.ts
@@ -75,6 +75,7 @@ export async function createAndStartSearchCrawler(
responseId,
request.userData.contentScraperSettings!,
request.userData.timeMeasures!,
request.userData.blockMedia,
);
await addContentCrawlRequest(r, responseId, request.userData.contentCrawlerKey!);
}
@@ -130,6 +131,31 @@ export async function createAndStartContentCrawler(
return { key, crawler };
}

/**
* PreNavigation hook that blocks resources based on the blockMedia setting
* from the request's userData.
* Only blocks resources if blockMedia is true.
*/
async function blockMediaResourcesHook({ page, request }: PlaywrightCrawlingContext<ContentCrawlerUserData>) {
await page.route('**/*', async (route) => {
> **Reviewer comment (Member):** `page.route` disables the native browser cache, which is why `blockRequests` is normally recommended (it is a native Chromium CDP call). Disabling the cache only hurts if you make more requests to the same site. I would run a perf test on more URLs of the same site, and test more sites as well, because this could slow us down.

const resourceType = route.request().resourceType();
const url = route.request().url();

// Block if it's an image/video/css resource type or has an image/video extension
if (request.userData.blockMedia && (
resourceType === 'image'
|| resourceType === 'video'
|| resourceType === 'media'
|| resourceType === 'stylesheet'
|| /\.(jpg|jpeg|png|gif|bmp|webp|mp4|webm|ogg|mov|css)$/i.test(url)
)) {
await route.abort();
} else {
await route.continue();
}
});
}
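The routing decision in the hook above is a pure function of the resource type, URL, and flag, so it can be factored out for unit testing. A sketch (`shouldBlockResource` is a hypothetical helper, not part of this PR):

```typescript
// Mirror of the blocking condition in blockMediaResourcesHook: block
// images, video, media, and stylesheets (by Playwright resource type or
// by file extension), but only when blockMedia is enabled.
const MEDIA_EXTENSION_REGEX = /\.(jpg|jpeg|png|gif|bmp|webp|mp4|webm|ogg|mov|css)$/i;
const BLOCKED_RESOURCE_TYPES = new Set(['image', 'video', 'media', 'stylesheet']);

function shouldBlockResource(resourceType: string, url: string, blockMedia: boolean): boolean {
    if (!blockMedia) return false;
    return BLOCKED_RESOURCE_TYPES.has(resourceType) || MEDIA_EXTENSION_REGEX.test(url);
}
```

The hook would then call `route.abort()` when this returns true and `route.continue()` otherwise.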

async function createPlaywrightContentCrawler(
crawlerOptions: PlaywrightCrawlerOptions,
key: string,
@@ -143,6 +169,11 @@ async function createPlaywrightContentCrawler(
await requestHandlerPlaywright(context as unknown as PlaywrightCrawlingContext<ContentCrawlerUserData>);
},
failedRequestHandler: ({ request }, err) => failedRequestHandler(request, err, ContentCrawlerTypes.PLAYWRIGHT),
preNavigationHooks: [
async (context) => {
await blockMediaResourcesHook(context as unknown as PlaywrightCrawlingContext<ContentCrawlerUserData>);
},
],
});
}

32 changes: 10 additions & 22 deletions src/input.ts
@@ -46,8 +46,13 @@ async function processInputInternal(
if (originalInput.outputFormats && typeof originalInput.outputFormats === 'string') {
originalInput.outputFormats = originalInput.outputFormats.split(',').map((format) => format.trim()) as OutputFormats[];
}
const input = { ...defaults, ...originalInput } as Input;

// noinspection SuspiciousTypeOfGuard
if (typeof originalInput.blockMedia === 'string') {
originalInput.blockMedia = originalInput.blockMedia === 'true' || originalInput.blockMedia === '1';
}

const input = { ...defaults, ...originalInput } as Input;
validateAndFillInput(input, standbyInit);

const {
@@ -111,27 +116,6 @@ function createPlaywrightCrawlerOptions(input: Input, proxy: ProxyConfiguration
maxConcurrency,
minConcurrency,
},
preNavigationHooks: input.blockMedia ? [
async ({ page }) => {
await page.route('**/*', async (route) => {
const resourceType = route.request().resourceType();
const url = route.request().url();

// Block if it's an image/video/css resource type or has an image/video extension
if (
resourceType === 'image'
|| resourceType === 'video'
|| resourceType === 'media'
|| resourceType === 'stylesheet'
|| /\.(jpg|jpeg|png|gif|bmp|webp|mp4|webm|ogg|mov|css)$/i.test(url)
) {
await route.abort();
} else {
await route.continue();
}
});
},
] : [],
},
};
}
@@ -207,4 +191,8 @@ export function validateAndFillInput(input: Input, standbyInit: boolean) {
if (input.scrapingTool !== 'browser-playwright' && input.scrapingTool !== 'raw-http') {
throw new UserInputError('The `scrapingTool` parameter must be either `browser-playwright` or `raw-http`.');
}
// Fall back to the schema default when blockMedia is not provided
if (input.blockMedia === undefined || input.blockMedia === null) {
input.blockMedia = defaults.blockMedia;
}
}
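Taken together, the string coercion in `processInputInternal` and the defaulting in `validateAndFillInput` amount to one normalization step. A sketch (`normalizeBlockMedia` is a hypothetical helper; the schema default is passed in rather than assumed, since it lives in input_schema.json):

```typescript
// Normalize the blockMedia input: accept booleans, the strings 'true'/'1'
// (query parameters arrive as strings in standby mode), or
// undefined/null, which falls back to the schema default.
function normalizeBlockMedia(value: unknown, schemaDefault: boolean): boolean {
    if (typeof value === 'string') return value === 'true' || value === '1';
    if (value === undefined || value === null) return schemaDefault;
    return Boolean(value);
}
```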
142 changes: 142 additions & 0 deletions src/performance-evaluation.ts
@@ -0,0 +1,142 @@
/**
* Performance evaluation of the RAG Web Browser with respect to different settings.
* This script runs a series of queries and saves performance results into a dataset.
* The results include average time for each time measure event.
*
* The evaluation is performed with different combinations of the following parameters:
* - `scrapingTool`: The tool used for scraping (e.g., `raw-http`, `browser-playwright`).
* - `mediaBlocked`: Whether media content is blocked during scraping (true/false).
* - `maxResults`: The maximum number of results to scrape (e.g., 1, 3).
*
* The script performs the following steps:
* 1. Runs a set of predefined queries using different combinations of parameters.
* 2. Fetches the results and computes the average time for each time measure event.
* 3. Logs the performance results, including average latency for each combination of parameters.
* 4. Aborts the last run of the actor to ensure no resources are wasted.
*
* The results are stored in a table format, showing the average latency for each combination of parameters.
*
* Usage:
* - Ensure the `APIFY_TOKEN` environment variable is set with your Apify API token.
* - Run the script to perform the performance evaluation.
* - The results will be logged to the console.
*/

import { log } from 'apify';

import { Output } from './types';

const EVALUATION_QUERIES = [
'apify',
'donald trump',
'ai agents',
];

const apifyToken = process.env.APIFY_TOKEN;

const user = 'jiri-spilka';
const actorId = 'apify~rag-web-browser';
const urlUserActor = `${user}--rag-web-browser-task`;

const memory = 8; // memory can't be changed in the standby mode
const scrapingToolSet = ['raw-http', 'browser-playwright'];
const mediaBlockedSet = [true, false];
const maxResultsSet = [1, 3];

const url = `https://${urlUserActor}.apify.actor`;

const headers = {
Accept: 'application/json',
Authorization: `Bearer ${apifyToken}`,
};

const results = new Map<string, Output[]>();
const resultsTable: string[] = [];

for (const scrapingTool of scrapingToolSet) {
for (const blockMedia of mediaBlockedSet) {
for (const maxResults of maxResultsSet) {
log.info(`Running ${EVALUATION_QUERIES.length} query/queries with ${scrapingTool}, mediaBlocked=${blockMedia}, maxResults=${maxResults}`);
log.info('Start in standby mode');
const r1 = await fetch(url, { method: 'GET', headers });
if (!r1.ok) {
throw new Error(`Failed to run the actor: ${JSON.stringify(await r1.json())}`);
} else {
// sleep for 10 seconds to let the actor start
await new Promise((resolve) => setTimeout(resolve, 10000));
}
for (const q of EVALUATION_QUERIES) {
const queryParams = new URLSearchParams({ query: q, scrapingTool, blockMedia: blockMedia.toString(), debugMode: 'true', maxResults: maxResults.toString() });
const urlWithParams = `${url}/search?${queryParams.toString()}`;
log.info(`Running ${urlWithParams}`);
const res = await fetch(urlWithParams, { method: 'GET', headers });
if (!res.ok) {
throw new Error(`Failed to run the actor: ${JSON.stringify(await res.json())}`);
}
const data: Output[] = await res.json();
log.info(`Received number of results: ${data.length}`);
const k = `${scrapingTool}__${blockMedia ? 'blocked' : 'allowed'}__${maxResults}`;
if (results.has(k)) {
results.set(k, [...results.get(k)!, ...data]);
} else {
results.set(k, data);
}
}
log.info(`Get the last run: ${actorId}`);
const response = await fetch(`https://api.apify.com/v2/acts/${actorId}/runs/last`, { headers });
const resp = await response.json();
const { id: runId } = resp.data;

// Abort the run so results are not mixed across configurations and autoscaling does not skew the measurements
log.info(`Abort run ${runId}`);
const r = await fetch(`https://api.apify.com/v2/actor-runs/${runId}/abort`, { method: 'POST', headers });
log.info(`The last run has been aborted status=${r.status}`);
}
}
}

for (const [key, data] of results) {
const remoteDataset = data;
log.info('Compute average time for each time measure event');
const timeMeasuresMap = new Map<string, number[]>();
const timeMeasuresTimeTaken = [];

// compute average time for the timeMeasures
for (const item of remoteDataset) {
const { timeMeasures } = item.crawl.debug ?? {};
if (!timeMeasures) {
continue;
}
for (const measure of timeMeasures) {
if (!timeMeasuresMap.has(measure.event)) {
timeMeasuresMap.set(measure.event, []);
}
timeMeasuresMap.set(measure.event, [...timeMeasuresMap.get(measure.event)!, measure.timeDeltaPrevMs]);
if (measure.event === 'playwright-before-response-send' || measure.event === 'cheerio-before-response-send') {
timeMeasuresTimeTaken.push(measure.timeMs);
}
}
}
log.info(`Performance for key: ${key}`);
log.info('Average time for each time measure event:', timeMeasuresMap);

for (const [k, value] of timeMeasuresMap) {
const sum = value.reduce((a, b) => a + b, 0);
const avg = sum / value.length;
log.info(`${k}: ${avg.toFixed(0)} ms`);
}

const avgLatency = timeMeasuresTimeTaken.reduce((a, b) => a + b, 0) / timeMeasuresTimeTaken.length / 1000;
log.info('Time taken for each request:', timeMeasuresTimeTaken);
log.info('Time taken on average', { average: avgLatency.toFixed(1) });

// Store results for the table
const [scrapingTool, mediaBlocked, maxResults] = key.split('__');
resultsTable.push(`| ${memory} | ${scrapingTool} | ${mediaBlocked} | ${maxResults} | ${avgLatency.toFixed(1)} |`);
}

// Print the results table
log.info('\nPerformance Results:');
log.info('| Memory (GB) | Scraping Tool | Media | Max Results | Latency (sec) |');
log.info('|-------------|---------------|---------------|-------------|---------------|');
resultsTable.forEach((row) => log.info(row));
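The per-event aggregation in the loop above can be expressed as a standalone helper. A sketch (`averageByEvent` is a hypothetical extraction, not part of the script):

```typescript
interface TimeMeasure { event: string; timeDeltaPrevMs: number; }

// Group timeDeltaPrevMs values by event name and average each group,
// mirroring the aggregation done over timeMeasuresMap above.
function averageByEvent(measures: TimeMeasure[]): Map<string, number> {
    const grouped = new Map<string, number[]>();
    for (const m of measures) {
        const values = grouped.get(m.event) ?? [];
        values.push(m.timeDeltaPrevMs);
        grouped.set(m.event, values);
    }
    const averages = new Map<string, number>();
    for (const [event, values] of grouped) {
        averages.set(event, values.reduce((a, b) => a + b, 0) / values.length);
    }
    return averages;
}
```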
52 changes: 0 additions & 52 deletions src/performance-measures.ts

This file was deleted.

2 changes: 2 additions & 0 deletions src/search.ts
@@ -39,6 +39,7 @@ function prepareRequest(
responseId,
contentScraperSettings,
null,
input.blockMedia,
)
: createSearchRequest(
query,
@@ -47,6 +48,7 @@ contentScraperSettings,
contentCrawlerKey,
searchCrawlerOptions.proxyConfiguration,
contentScraperSettings,
input.blockMedia,
);

addTimeMeasureEvent(req.userData!, 'request-received', Date.now());