Commit 5aa72c6

Authored Dec 2, 2024
Feat: review fixes (#34)
* Send results in response by rank
* Add CSS selector to remove navigation/footer elements etc. Return simple HTML by default (add an option to enable/disable the readability plugin)
* Fix outputFormat in Standby mode
* Fix CSS selector for removing attributes and tags. Remove search results when scraping a single URL
* Update lambda function
1 parent ec9be0c · commit 5aa72c6

13 files changed: +100 −33

‎.actor/actor.json

+1 −1

@@ -3,7 +3,7 @@
   "name": "rag-web-browser",
   "title": "RAG Web browser",
   "description": "Web browser for OpenAI Assistants API and RAG pipelines, similar to a web browser in ChatGPT. It queries Google Search, scrapes the top N pages from the results, and returns their cleaned content as Markdown for further processing by an LLM.",
-  "version": "0.1",
+  "version": "1.0",
   "input": "./input_schema.json",
   "dockerfile": "./Dockerfile",
   "storages": {

‎.actor/input_schema.json

+18 −1

@@ -39,7 +39,8 @@
       "minimum": 1,
       "maximum": 300,
       "default": 40,
-      "unit": "seconds"
+      "unit": "seconds",
+      "editor": "hidden"
     },
     "serpProxyGroup": {
       "title": "SERP proxy group",
@@ -71,6 +72,22 @@
       "editor": "proxy",
       "sectionCaption": "Target pages scraping settings"
     },
+    "removeElementsCssSelector": {
+      "title": "Remove HTML elements (CSS selector)",
+      "type": "string",
+      "description": "A CSS selector matching HTML elements that will be removed from the DOM, before converting it to text, Markdown, or saving as HTML. This is useful to skip irrelevant page content. The value must be a valid CSS selector as accepted by the `document.querySelectorAll()` function. \n\nBy default, the Actor removes common navigation elements, headers, footers, modals, scripts, and inline image. You can disable the removal by setting this value to some non-existent CSS selector like `dummy_keep_everything`.",
+      "editor": "textarea",
+      "default": "nav, footer, script, style, noscript, svg, img[src^='data:'],\n[role=\"alert\"],\n[role=\"banner\"],\n[role=\"dialog\"],\n[role=\"alertdialog\"],\n[role=\"region\"][aria-label*=\"skip\" i],\n[aria-modal=\"true\"]",
+      "prefill": "nav, footer, script, style, noscript, svg, img[src^='data:'],\n[role=\"alert\"],\n[role=\"banner\"],\n[role=\"dialog\"],\n[role=\"alertdialog\"],\n[role=\"region\"][aria-label*=\"skip\" i],\n[aria-modal=\"true\"]"
+    },
+    "htmlTransformer": {
+      "title": "HTML transformer",
+      "type": "string",
+      "description": "Specify how to transform the HTML to extract meaningful content without any extra fluff, like navigation or modals. The HTML transformation happens after removing and clicking the DOM elements.\n\n- **None** (default) - Only removes the HTML elements specified via 'Remove HTML elements' option.\n\n- **Readable text** - Extracts the main contents of the webpage, without navigation and other fluff.",
+      "default": "none",
+      "prefill": "none",
+      "editor": "hidden"
+    },
     "initialConcurrency": {
       "title": "Initial browsing concurrency",
       "type": "integer",

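For illustration only (not part of this commit): a minimal sketch of how a `removeElementsCssSelector`-style selector strips navigation and footer markup before a page is converted to text or Markdown. It uses Cheerio, as the Actor's server-side code does; the function name and the shortened selector are made up for the example.

```ts
import * as cheerio from 'cheerio';

// Shortened subset of the default selector above, just for the example.
const removeElementsCssSelector = "nav, footer, script, style, noscript, svg, img[src^='data:']";

function stripUnwantedElements(html: string): string {
    const $ = cheerio.load(html);
    // Same idea as the processHtml() change in this commit: drop matching nodes, keep the rest.
    $('body').find(removeElementsCssSelector).remove();
    return $('body').html() ?? '';
}

// stripUnwantedElements('<body><nav>menu</nav><p>content</p></body>') === '<p>content</p>'
```
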
‎README.md

−1

@@ -56,7 +56,6 @@ the web page content directly like this:
         "httpStatusMessage": "OK",
         "loadedAt": "2024-11-21T14:04:28.090Z"
     },
-    "searchResult": null,
     "metadata": {
         "url": "https://openai.com/index/introducing-chatgpt-search/",
         "title": "Introducing ChatGPT search | OpenAI",

‎docs/aws-lambda-call-rag-web-browser.py

+11 −3

@@ -13,6 +13,7 @@
 ACTOR_BASE_URL = "https://rag-web-browser.apify.actor"  # Base URL from OpenAPI schema
 MAX_RESULTS = 3  # Limit the number of results to decrease response size, limit 25KB
 TRUNCATE_TEXT_LENGTH = 5000  # Truncate the response body to decrease the response size, limit 25KB
+OUTPUT_FORMATS = "markdown"  # Default output format
 
 # Lambda function environment variable
 APIFY_API_TOKEN = os.getenv("APIFY_API_TOKEN")
@@ -37,7 +38,11 @@ def lambda_handler(event, context):
     # Limit the number of results to decrease response size
     # Getting: lambda response exceeds maximum size 25KB: 66945
     print("Query params: ", query_params)
-    query_params["maxResults"] = min(3, int(query_params.get("maxResults", 3)))
+    query_params["maxResults"] = min(MAX_RESULTS, int(query_params.get("maxResults", MAX_RESULTS)))
+
+    # Always return Markdown format
+    query_params["outputFormats"] = query_params.get("outputFormats", OUTPUT_FORMATS) + f",{OUTPUT_FORMATS}"
+    query_params["outputFormats"] = ",".join(set(query_params["outputFormats"].split(",")))
     print("Limited max results to: ", query_params["maxResults"])
 
     try:
@@ -47,13 +52,16 @@
             req = urllib.request.Request(url, headers=headers, method="GET")
             with urllib.request.urlopen(req) as response:
                 response_body = response.read().decode("utf-8")
+                print("Received response from RAG Web Browser", response_body)
 
         else:
             return {"statusCode": 400, "body": json.dumps({"message": f"HTTP method {http_method} not supported"})}
 
         response = json.loads(response_body)
+
         # Truncate the response body to decrease the response size, there is a limit of 25KB
-        body = [d["text"][:TRUNCATE_TEXT_LENGTH] + "..." for d in response]
+        print("Truncating the response body")
+        body = [d.get("markdown", "")[:TRUNCATE_TEXT_LENGTH] + "..." for d in response]
 
         # Handle the API response
         action_response = {
@@ -70,6 +78,7 @@
         return dummy_api_response
 
     except Exception as e:
+        print("Error occurred", e)
         return {"statusCode": 500, "body": json.dumps({"message": "Internal server error", "error": str(e)})}
 
 
@@ -88,4 +97,3 @@ def lambda_handler(event, context):
         "messageVersion": "1.0",
     }
     handler_response = lambda_handler(test_event, None)
-    print("Response: ", handler_response)

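For reference, a hedged TypeScript sketch of the request this Lambda builds. The base URL and the `query`/`maxResults`/`outputFormats` parameter names come from the code in this commit; the `/search` route and the Bearer-token header are assumptions made for the example.

```ts
const ACTOR_BASE_URL = 'https://rag-web-browser.apify.actor';

async function callRagWebBrowser(query: string, token: string): Promise<Array<{ markdown?: string }>> {
    const params = new URLSearchParams({
        query,
        maxResults: '3',           // keep the response small (Lambda payload limit is 25 KB)
        outputFormats: 'markdown', // always request Markdown, mirroring the Lambda change above
    });
    // Assumed route and auth header; adjust to however you normally call the Actor.
    const res = await fetch(`${ACTOR_BASE_URL}/search?${params}`, {
        headers: { Authorization: `Bearer ${token}` },
    });
    return await res.json() as Array<{ markdown?: string }>;
}
```
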
‎src/const.ts

+2

@@ -11,6 +11,7 @@ export const PLAYWRIGHT_REQUEST_TIMEOUT_NORMAL_MODE_SECS = 60;
 export const defaults = {
     debugMode: false,
     dynamicContentWaitSecs: 10,
+    htmlTransformer: 'none',
     initialConcurrency: 5,
     keepAlive: true,
     maxConcurrency: 10,
@@ -24,6 +25,7 @@
     query: null,
     readableTextCharThreshold: 100,
     removeCookieWarnings: true,
+    removeElementsCssSelector: "nav, footer, script, style, noscript, svg, img[src^='data:'],\n[role=\"alert\"],\n[role=\"banner\"],\n[role=\"dialog\"],\n[role=\"alertdialog\"],\n[role=\"region\"][aria-label*=\"skip\" i],\n[aria-modal=\"true\"]",
     requestTimeoutSecs: 40,
     requestTimeoutSecsMax: 300,
     serpMaxRetries: 2,

‎src/crawlers.ts

+2

@@ -89,7 +89,9 @@ async function createAndStartSearchCrawler(
 
     addTimeMeasureEvent(request.userData!, 'before-playwright-queue-add');
     const responseId = request.uniqueKey;
+    let rank = 1;
     for (const result of results) {
+        result.rank = rank++;
         const r = createRequest(result, responseId, request.userData.timeMeasures!);
         await addPlaywrightCrawlRequest(r, responseId, request.userData.playwrightCrawlerKey!);
     }

‎src/input.ts

+11 −4

@@ -4,13 +4,19 @@ import { firefox } from 'playwright';
 
 import { defaults } from './const.js';
 import { UserInputError } from './errors.js';
-import type { Input, PlaywrightScraperSettings, OutputFormats } from './types.js';
+import type { Input, PlaywrightScraperSettings, OutputFormats, StandbyInput } from './types.js';
 
 /**
  * Processes the input and returns the settings for the crawler (adapted from: Website Content Crawler).
  */
 
-export async function processInput(originalInput: Partial<Input>, standbyInit: boolean = false) {
+export async function processInput(
+    originalInput: Partial<Input> | Partial<StandbyInput>,
+    standbyInit: boolean = false,
+) {
+    if (originalInput.outputFormats && typeof originalInput.outputFormats === 'string') {
+        originalInput.outputFormats = originalInput.outputFormats.split(',').map((format) => format.trim()) as OutputFormats[];
+    }
     const input = { ...defaults, ...originalInput } as Input;
 
     validateAndFillInput(input, standbyInit);
@@ -24,7 +30,6 @@ export async function processInput(originalInput: Partial<Input>, standbyInit: b
         maxConcurrency,
         maxRequestRetries,
         serpMaxRetries,
-        outputFormats,
         proxyConfiguration,
         serpProxyGroup,
         readableTextCharThreshold,
@@ -68,10 +73,12 @@ export async function processInput(originalInput: Partial<Input>, standbyInit: b
     const playwrightScraperSettings: PlaywrightScraperSettings = {
         debugMode,
         dynamicContentWaitSecs,
+        htmlTransformer: 'none',
         maxHtmlCharsToProcess: 1.5e6,
-        outputFormats,
+        outputFormats: input.outputFormats as OutputFormats[],
         readableTextCharThreshold,
         removeCookieWarnings,
+        removeElementsCssSelector: input.removeElementsCssSelector,
     };
 
     return { input, cheerioCrawlerOptions, playwrightCrawlerOptions, playwrightScraperSettings };

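A small sketch of the Standby-mode fix above, assuming query parameters always arrive as plain strings: a comma-separated `outputFormats` value is split into the array the rest of the pipeline expects (the helper name and the literal union are illustrative, not part of the commit).

```ts
type OutputFormats = 'text' | 'markdown' | 'html';

function normalizeOutputFormats(value: string | OutputFormats[]): OutputFormats[] {
    if (typeof value === 'string') {
        // Standby mode passes outputFormats as a comma-separated query parameter.
        return value.split(',').map((format) => format.trim()) as OutputFormats[];
    }
    return value;
}

// normalizeOutputFormats('markdown, html') -> ['markdown', 'html']
// normalizeOutputFormats(['text'])         -> ['text']
```
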
‎src/main.ts

+7 −5

@@ -6,11 +6,11 @@ import { PLAYWRIGHT_REQUEST_TIMEOUT_NORMAL_MODE_SECS } from './const.js';
 import { addPlaywrightCrawlRequest, addSearchRequest, createAndStartCrawlers, getPlaywrightCrawlerKey } from './crawlers.js';
 import { UserInputError } from './errors.js';
 import { processInput } from './input.js';
-import { addTimeoutToAllResponses, sendResponseError } from './responses.js';
+import { addTimeoutToAllResponses, createResponse, sendResponseError } from './responses.js';
 import { Input } from './types.js';
 import {
     addTimeMeasureEvent,
-    checkForExtraParams,
+    checkAndRemoveExtraParams,
     createRequest,
     createSearchRequest,
     interpretAsUrl,
@@ -31,7 +31,7 @@ async function getSearch(request: IncomingMessage, response: ServerResponse) {
     const requestReceivedTime = Date.now();
     const params = parseParameters(request.url?.slice(ROUTE_SEARCH.length, request.url.length) ?? '');
     log.info(`Received query parameters: ${JSON.stringify(params)}`);
-    checkForExtraParams(params);
+    checkAndRemoveExtraParams(params);
 
     // Process the query parameters the same way se normal inputs
     const {
@@ -48,8 +48,9 @@
     const inputUrl = interpretAsUrl(input.query);
     input.query = inputUrl ?? input.query;
     // Create a request depending on whether the input is a URL or search query
+    const responseId = randomId();
     const req = inputUrl
-        ? createRequest({ url: input.query }, randomId(), null)
+        ? createRequest({ url: input.query }, responseId, null)
         : createSearchRequest(
             input.query,
             input.maxResults,
@@ -60,7 +61,8 @@
     if (inputUrl) {
         // If the input query is a URL, we don't need to run the search crawler
         log.info(`Skipping Google Search query as ${input.query} is a valid URL`);
-        await addPlaywrightCrawlRequest(req, req.uniqueKey!, playwrightCrawlerKey);
+        createResponse(responseId, response);
+        await addPlaywrightCrawlRequest(req, responseId, playwrightCrawlerKey);
     } else {
         await addSearchRequest(req, response, cheerioCrawlerOptions);
     }

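Usage sketch for the direct-URL path above: when the query is a URL, Google Search is skipped, a response slot is registered up front via `createResponse`, and the page is scraped directly; per the README and `utils.ts` changes, such results carry no `searchResult` field. The `/search` route and the auth header are assumptions for the example.

```ts
const params = new URLSearchParams({
    query: 'https://openai.com/index/introducing-chatgpt-search/', // a URL, so search is skipped
    outputFormats: 'markdown',
});
const response = await fetch(`https://rag-web-browser.apify.actor/search?${params}`, {
    headers: { Authorization: `Bearer ${process.env.APIFY_API_TOKEN}` },
});
const [page] = await response.json() as Array<{ metadata?: { title?: string }; markdown?: string }>;
console.log(page.metadata?.title, page.markdown?.slice(0, 200));
```
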
‎src/playwright-req-handler.ts

+13 −8

@@ -9,7 +9,12 @@ import { addTimeMeasureEvent, transformTimeMeasuresToRelative } from './utils.js
 import { processHtml } from './website-content-crawler/html-processing.js';
 import { htmlToMarkdown } from './website-content-crawler/markdown.js';
 
-const ACTOR_TIMEOUT_AT = process.env.ACTOR_TIMEOUT_AT ? parseInt(process.env.ACTOR_TIMEOUT_AT, 10) : null;
+let ACTOR_TIMEOUT_AT: number | undefined;
+try {
+    ACTOR_TIMEOUT_AT = process.env.ACTOR_TIMEOUT_AT ? new Date(process.env.ACTOR_TIMEOUT_AT).getTime() : undefined;
+} catch (err) {
+    ACTOR_TIMEOUT_AT = undefined;
+}
 
 /**
  * Waits for the `time` to pass, but breaks early if the page is loaded (source: Website Content Crawler).
@@ -113,24 +118,24 @@ export async function requestHandlerPlaywright(
 
     const result: Output = {
         crawl: {
-            httpStatusCode: page ? response?.status() : null,
+            httpStatusCode: page ? response?.status() : undefined,
             httpStatusMessage: 'OK',
             loadedAt: new Date(),
             uniqueKey: request.uniqueKey,
             requestStatus: ContentCrawlerStatus.HANDLED,
         },
         searchResult: request.userData.searchResult!,
         metadata: {
-            author: $('meta[name=author]').first().attr('content') ?? null,
+            author: $('meta[name=author]').first().attr('content') ?? undefined,
             title: $('title').first().text(),
-            description: $('meta[name=description]').first().attr('content') ?? null,
-            languageCode: $html.first().attr('lang') ?? null,
+            description: $('meta[name=description]').first().attr('content') ?? undefined,
+            languageCode: $html.first().attr('lang') ?? undefined,
             url: request.url,
         },
         query: request.userData.query,
-        text: settings.outputFormats.includes('text') ? text : null,
-        markdown: settings.outputFormats.includes('markdown') ? htmlToMarkdown(processedHtml) : null,
-        html: settings.outputFormats.includes('html') ? processedHtml : null,
+        text: settings.outputFormats.includes('text') ? text : undefined,
+        markdown: settings.outputFormats.includes('markdown') ? htmlToMarkdown(processedHtml) : undefined,
+        html: settings.outputFormats.includes('html') ? processedHtml : undefined,
     };
 
     addTimeMeasureEvent(request.userData, 'playwright-before-response-send');

‎src/responses.ts

+16 −3

@@ -76,7 +76,7 @@ export const sendResponseOk = (responseId: string, result: unknown, contentType:
 };
 
 /**
- * Check if all results have been handled.
+ * Check if all results have been handled. It is used to determine if the response can be sent.
  */
 const checkAllResultsHandled = (responseId: string) => {
     const res = getResponse(responseId);
@@ -90,6 +90,19 @@
     return true;
 };
 
+/**
+ * Sort results by rank.
+ */
+const sortResultsByRank = (res: ResponseData): Output[] => {
+    const resultsArray = Array.from(res.resultsMap.values());
+    resultsArray.sort((a, b) => {
+        const ra = a.searchResult.rank ?? Infinity;
+        const rb = b.searchResult.rank ?? Infinity;
+        return ra - rb;
+    });
+    return resultsArray;
+};
+
 /**
  * Send response with error status code. If the response contains some handled requests,
  * return 200 status otherwise 500.
@@ -115,7 +128,7 @@ export const sendResponseError = (responseId: string, message: string) => {
     res.response.writeHead(returnStatusCode, { 'Content-Type': 'application/json' });
     if (returnStatusCode === 200) {
         log.warning(`Response for request ${responseId} has been sent with partial results`);
-        res.response.end(JSON.stringify(Array.from(res.resultsMap.values())));
+        res.response.end(JSON.stringify(sortResultsByRank(res)));
     } else {
         log.error(`Response for request ${responseId} has been sent with error: ${message}`);
         res.response.end(JSON.stringify({ errorMessage: message }));
@@ -131,7 +144,7 @@ export const sendResponseIfFinished = (responseId: string) => {
     if (!res) return;
 
     if (checkAllResultsHandled(responseId)) {
-        sendResponseOk(responseId, JSON.stringify(Array.from(res.resultsMap.values())), 'application/json');
+        sendResponseOk(responseId, JSON.stringify(sortResultsByRank(res)), 'application/json');
         responseData.delete(responseId);
     }
 };

‎src/types.ts

+8

@@ -26,12 +26,18 @@ export type Input = {
     minConcurrency: number;
     proxyConfiguration: ProxyConfigurationOptions;
     readableTextCharThreshold: number;
+    removeElementsCssSelector: string;
     removeCookieWarnings: boolean;
 };
 
+export type StandbyInput = Input & {
+    outputFormats: OutputFormats[] | string
+}
+
 export type OrganicResult = {
     description?: string;
     title?: string;
+    rank?: number;
     url?: string;
 };
 
@@ -72,10 +78,12 @@ export type UserData = {
 export interface PlaywrightScraperSettings {
     debugMode: boolean;
     dynamicContentWaitSecs: number;
+    htmlTransformer?: string
     maxHtmlCharsToProcess: number;
     outputFormats: OutputFormats[];
     readableTextCharThreshold: number;
     removeCookieWarnings?: boolean;
+    removeElementsCssSelector?: string;
 }
 
 export type Output = {

‎src/utils.ts

+2 −2

@@ -11,7 +11,7 @@ export function parseParameters(url: string): ParsedUrlQuery {
 /**
  * Check whether the query parameters are valid (do not support extra parameters)
  */
-export function checkForExtraParams(params: ParsedUrlQuery) {
+export function checkAndRemoveExtraParams(params: ParsedUrlQuery) {
     const keys = Object.keys(defaults);
     keys.push('token', '?token'); // token is a special parameter
     for (const key of Object.keys(params)) {
@@ -78,7 +78,7 @@ export function createRequest(
         uniqueKey: randomId(),
         userData: {
             responseId,
-            searchResult: result,
+            searchResult: result.url && result.title ? result : undefined,
             timeMeasures: timeMeasures ? [...timeMeasures] : [],
         },
     };

‎src/website-content-crawler/html-processing.ts

+9 −5

@@ -14,7 +14,9 @@ export async function processHtml(
     $: CheerioAPI,
 ): Promise<string> {
     const $body = $('body').clone();
-
+    if (settings.removeElementsCssSelector) {
+        $body.find(settings.removeElementsCssSelector).remove();
+    }
     const simplifiedBody = $body.html()?.trim();
 
     const simplified = typeof simplifiedBody === 'string'
@@ -31,10 +33,12 @@
         : (html ?? '');
 
     let ret = null;
-    try {
-        ret = await readableText({ html: simplified, url, settings, options: { fallbackToNone: false } });
-    } catch (error) {
-        log.warning(`Processing of HTML failed with error:`, { error });
+    if (settings.htmlTransformer === 'readableText') {
+        try {
+            ret = await readableText({ html: simplified, url, settings, options: { fallbackToNone: false } });
+        } catch (error) {
+            log.warning(`Processing of HTML failed with error:`, { error });
+        }
     }
     return ret ?? (simplified as string);
 }

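The `readableText()` helper gated above comes from the Website Content Crawler code and is not part of this diff. Purely as an illustration of what the `readableText` transformer conceptually does when enabled, here is a sketch assuming a Mozilla Readability + jsdom setup (not the Actor's actual implementation).

```ts
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';

function readableTextSketch(html: string, url: string): string | null {
    const dom = new JSDOM(html, { url });
    // Readability extracts the main article content, dropping navigation and other fluff.
    const article = new Readability(dom.window.document).parse();
    // Returning null lets the caller fall back to the simplified HTML,
    // mirroring the `ret ?? simplified` fallback in processHtml() above.
    return article?.content ?? null;
}
```
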