Commit 5aa72c6

Authored Dec 2, 2024
Feat: review fixes (#34)
* Send results in response by rank
* Add CSS selector to remove navigation/footer elements etc. Return simple HTML by default (add an option to enable/disable the readability plugin)
* Fix outputFormat in Standby mode
* Fix CSS selector for removing attributes and tags. Remove search results when scraping a single URL
* Update lambda function
1 parent ec9be0c · commit 5aa72c6

13 files changed: +100 −33

‎.actor/actor.json

+1 −1

@@ -3,7 +3,7 @@
   "name": "rag-web-browser",
   "title": "RAG Web browser",
   "description": "Web browser for OpenAI Assistants API and RAG pipelines, similar to a web browser in ChatGPT. It queries Google Search, scrapes the top N pages from the results, and returns their cleaned content as Markdown for further processing by an LLM.",
-  "version": "0.1",
+  "version": "1.0",
   "input": "./input_schema.json",
   "dockerfile": "./Dockerfile",
   "storages": {

‎.actor/input_schema.json

+18 −1

@@ -39,7 +39,8 @@
       "minimum": 1,
       "maximum": 300,
       "default": 40,
-      "unit": "seconds"
+      "unit": "seconds",
+      "editor": "hidden"
     },
     "serpProxyGroup": {
       "title": "SERP proxy group",
@@ -71,6 +72,22 @@
       "editor": "proxy",
       "sectionCaption": "Target pages scraping settings"
     },
+    "removeElementsCssSelector": {
+      "title": "Remove HTML elements (CSS selector)",
+      "type": "string",
+      "description": "A CSS selector matching HTML elements that will be removed from the DOM, before converting it to text, Markdown, or saving as HTML. This is useful to skip irrelevant page content. The value must be a valid CSS selector as accepted by the `document.querySelectorAll()` function. \n\nBy default, the Actor removes common navigation elements, headers, footers, modals, scripts, and inline image. You can disable the removal by setting this value to some non-existent CSS selector like `dummy_keep_everything`.",
+      "editor": "textarea",
+      "default": "nav, footer, script, style, noscript, svg, img[src^='data:'],\n[role=\"alert\"],\n[role=\"banner\"],\n[role=\"dialog\"],\n[role=\"alertdialog\"],\n[role=\"region\"][aria-label*=\"skip\" i],\n[aria-modal=\"true\"]",
+      "prefill": "nav, footer, script, style, noscript, svg, img[src^='data:'],\n[role=\"alert\"],\n[role=\"banner\"],\n[role=\"dialog\"],\n[role=\"alertdialog\"],\n[role=\"region\"][aria-label*=\"skip\" i],\n[aria-modal=\"true\"]"
+    },
+    "htmlTransformer": {
+      "title": "HTML transformer",
+      "type": "string",
+      "description": "Specify how to transform the HTML to extract meaningful content without any extra fluff, like navigation or modals. The HTML transformation happens after removing and clicking the DOM elements.\n\n- **None** (default) - Only removes the HTML elements specified via 'Remove HTML elements' option.\n\n- **Readable text** - Extracts the main contents of the webpage, without navigation and other fluff.",
+      "default": "none",
+      "prefill": "none",
+      "editor": "hidden"
+    },
     "initialConcurrency": {
       "title": "Initial browsing concurrency",
       "type": "integer",

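For illustration only (not part of this commit): a minimal sketch of how a `removeElementsCssSelector`-style selector strips navigation and footer markup before a page is converted to text or Markdown. It uses Cheerio, as the Actor's server-side code does; the function name and the shortened selector are made up for the example.

```ts
import * as cheerio from 'cheerio';

// Shortened subset of the default selector above, just for the example.
const removeElementsCssSelector = "nav, footer, script, style, noscript, svg, img[src^='data:']";

function stripUnwantedElements(html: string): string {
    const $ = cheerio.load(html);
    // Same idea as the processHtml() change in this commit: drop matching nodes, keep the rest.
    $('body').find(removeElementsCssSelector).remove();
    return $('body').html() ?? '';
}

// stripUnwantedElements('<body><nav>menu</nav><p>content</p></body>') === '<p>content</p>'
```
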
‎README.md

−1

@@ -56,7 +56,6 @@ the web page content directly like this:
         "httpStatusMessage": "OK",
         "loadedAt": "2024-11-21T14:04:28.090Z"
     },
-    "searchResult": null,
     "metadata": {
         "url": "https://openai.com/index/introducing-chatgpt-search/",
         "title": "Introducing ChatGPT search | OpenAI",

‎docs/aws-lambda-call-rag-web-browser.py

+11 −3

@@ -13,6 +13,7 @@
 ACTOR_BASE_URL = "https://rag-web-browser.apify.actor"  # Base URL from OpenAPI schema
 MAX_RESULTS = 3  # Limit the number of results to decrease response size, limit 25KB
 TRUNCATE_TEXT_LENGTH = 5000  # Truncate the response body to decrease the response size, limit 25KB
+OUTPUT_FORMATS = "markdown"  # Default output format
 
 # Lambda function environment variable
 APIFY_API_TOKEN = os.getenv("APIFY_API_TOKEN")
@@ -37,7 +38,11 @@ def lambda_handler(event, context):
     # Limit the number of results to decrease response size
     # Getting: lambda response exceeds maximum size 25KB: 66945
     print("Query params: ", query_params)
-    query_params["maxResults"] = min(3, int(query_params.get("maxResults", 3)))
+    query_params["maxResults"] = min(MAX_RESULTS, int(query_params.get("maxResults", MAX_RESULTS)))
+
+    # Always return Markdown format
+    query_params["outputFormats"] = query_params.get("outputFormats", OUTPUT_FORMATS) + f",{OUTPUT_FORMATS}"
+    query_params["outputFormats"] = ",".join(set(query_params["outputFormats"].split(",")))
     print("Limited max results to: ", query_params["maxResults"])
 
     try:
@@ -47,13 +52,16 @@
             req = urllib.request.Request(url, headers=headers, method="GET")
             with urllib.request.urlopen(req) as response:
                 response_body = response.read().decode("utf-8")
+                print("Received response from RAG Web Browser", response_body)
 
         else:
             return {"statusCode": 400, "body": json.dumps({"message": f"HTTP method {http_method} not supported"})}
 
         response = json.loads(response_body)
+
         # Truncate the response body to decrease the response size, there is a limit of 25KB
-        body = [d["text"][:TRUNCATE_TEXT_LENGTH] + "..." for d in response]
+        print("Truncating the response body")
+        body = [d.get("markdown", "")[:TRUNCATE_TEXT_LENGTH] + "..." for d in response]
 
         # Handle the API response
         action_response = {
@@ -70,6 +78,7 @@
         return dummy_api_response
 
     except Exception as e:
+        print("Error occurred", e)
         return {"statusCode": 500, "body": json.dumps({"message": "Internal server error", "error": str(e)})}
 
 
@@ -88,4 +97,3 @@ def lambda_handler(event, context):
         "messageVersion": "1.0",
     }
     handler_response = lambda_handler(test_event, None)
-    print("Response: ", handler_response)

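For reference, a hedged TypeScript sketch of the request this Lambda builds. The base URL and the `query`/`maxResults`/`outputFormats` parameter names come from the code in this commit; the `/search` route and the Bearer-token header are assumptions made for the example.

```ts
const ACTOR_BASE_URL = 'https://rag-web-browser.apify.actor';

async function callRagWebBrowser(query: string, token: string): Promise<Array<{ markdown?: string }>> {
    const params = new URLSearchParams({
        query,
        maxResults: '3',           // keep the response small (Lambda payload limit is 25 KB)
        outputFormats: 'markdown', // always request Markdown, mirroring the Lambda change above
    });
    // Assumed route and auth header; adjust to however you normally call the Actor.
    const res = await fetch(`${ACTOR_BASE_URL}/search?${params}`, {
        headers: { Authorization: `Bearer ${token}` },
    });
    return await res.json() as Array<{ markdown?: string }>;
}
```
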
‎src/const.ts

+2

@@ -11,6 +11,7 @@ export const PLAYWRIGHT_REQUEST_TIMEOUT_NORMAL_MODE_SECS = 60;
 export const defaults = {
     debugMode: false,
     dynamicContentWaitSecs: 10,
+    htmlTransformer: 'none',
     initialConcurrency: 5,
     keepAlive: true,
     maxConcurrency: 10,
@@ -24,6 +25,7 @@
     query: null,
     readableTextCharThreshold: 100,
     removeCookieWarnings: true,
+    removeElementsCssSelector: "nav, footer, script, style, noscript, svg, img[src^='data:'],\n[role=\"alert\"],\n[role=\"banner\"],\n[role=\"dialog\"],\n[role=\"alertdialog\"],\n[role=\"region\"][aria-label*=\"skip\" i],\n[aria-modal=\"true\"]",
     requestTimeoutSecs: 40,
     requestTimeoutSecsMax: 300,
     serpMaxRetries: 2,

‎src/crawlers.ts

+2

@@ -89,7 +89,9 @@ async function createAndStartSearchCrawler(
 
     addTimeMeasureEvent(request.userData!, 'before-playwright-queue-add');
     const responseId = request.uniqueKey;
+    let rank = 1;
     for (const result of results) {
+        result.rank = rank++;
         const r = createRequest(result, responseId, request.userData.timeMeasures!);
         await addPlaywrightCrawlRequest(r, responseId, request.userData.playwrightCrawlerKey!);
     }

‎src/input.ts

+11 −4

@@ -4,13 +4,19 @@ import { firefox } from 'playwright';
 
 import { defaults } from './const.js';
 import { UserInputError } from './errors.js';
-import type { Input, PlaywrightScraperSettings, OutputFormats } from './types.js';
+import type { Input, PlaywrightScraperSettings, OutputFormats, StandbyInput } from './types.js';
 
 /**
  * Processes the input and returns the settings for the crawler (adapted from: Website Content Crawler).
  */
 
-export async function processInput(originalInput: Partial<Input>, standbyInit: boolean = false) {
+export async function processInput(
+    originalInput: Partial<Input> | Partial<StandbyInput>,
+    standbyInit: boolean = false,
+) {
+    if (originalInput.outputFormats && typeof originalInput.outputFormats === 'string') {
+        originalInput.outputFormats = originalInput.outputFormats.split(',').map((format) => format.trim()) as OutputFormats[];
+    }
     const input = { ...defaults, ...originalInput } as Input;
 
     validateAndFillInput(input, standbyInit);
@@ -24,7 +30,6 @@ export async function processInput(originalInput: Partial<Input>, standbyInit: b
         maxConcurrency,
         maxRequestRetries,
         serpMaxRetries,
-        outputFormats,
         proxyConfiguration,
         serpProxyGroup,
         readableTextCharThreshold,
@@ -68,10 +73,12 @@ export async function processInput(originalInput: Partial<Input>, standbyInit: b
     const playwrightScraperSettings: PlaywrightScraperSettings = {
         debugMode,
         dynamicContentWaitSecs,
+        htmlTransformer: 'none',
         maxHtmlCharsToProcess: 1.5e6,
-        outputFormats,
+        outputFormats: input.outputFormats as OutputFormats[],
         readableTextCharThreshold,
         removeCookieWarnings,
+        removeElementsCssSelector: input.removeElementsCssSelector,
     };
 
     return { input, cheerioCrawlerOptions, playwrightCrawlerOptions, playwrightScraperSettings };

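A small sketch of the Standby-mode fix above, assuming query parameters always arrive as plain strings: a comma-separated `outputFormats` value is split into the array the rest of the pipeline expects (the helper name and the literal union are illustrative, not part of the commit).

```ts
type OutputFormats = 'text' | 'markdown' | 'html';

function normalizeOutputFormats(value: string | OutputFormats[]): OutputFormats[] {
    if (typeof value === 'string') {
        // Standby mode passes outputFormats as a comma-separated query parameter.
        return value.split(',').map((format) => format.trim()) as OutputFormats[];
    }
    return value;
}

// normalizeOutputFormats('markdown, html') -> ['markdown', 'html']
// normalizeOutputFormats(['text'])         -> ['text']
```
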
‎src/main.ts

+7 −5

@@ -6,11 +6,11 @@ import { PLAYWRIGHT_REQUEST_TIMEOUT_NORMAL_MODE_SECS } from './const.js';
 import { addPlaywrightCrawlRequest, addSearchRequest, createAndStartCrawlers, getPlaywrightCrawlerKey } from './crawlers.js';
 import { UserInputError } from './errors.js';
 import { processInput } from './input.js';
-import { addTimeoutToAllResponses, sendResponseError } from './responses.js';
+import { addTimeoutToAllResponses, createResponse, sendResponseError } from './responses.js';
 import { Input } from './types.js';
 import {
     addTimeMeasureEvent,
-    checkForExtraParams,
+    checkAndRemoveExtraParams,
     createRequest,
     createSearchRequest,
     interpretAsUrl,
@@ -31,7 +31,7 @@ async function getSearch(request: IncomingMessage, response: ServerResponse) {
     const requestReceivedTime = Date.now();
     const params = parseParameters(request.url?.slice(ROUTE_SEARCH.length, request.url.length) ?? '');
     log.info(`Received query parameters: ${JSON.stringify(params)}`);
-    checkForExtraParams(params);
+    checkAndRemoveExtraParams(params);
 
     // Process the query parameters the same way se normal inputs
     const {
@@ -48,8 +48,9 @@
     const inputUrl = interpretAsUrl(input.query);
     input.query = inputUrl ?? input.query;
     // Create a request depending on whether the input is a URL or search query
+    const responseId = randomId();
     const req = inputUrl
-        ? createRequest({ url: input.query }, randomId(), null)
+        ? createRequest({ url: input.query }, responseId, null)
         : createSearchRequest(
             input.query,
             input.maxResults,
@@ -60,7 +61,8 @@
     if (inputUrl) {
         // If the input query is a URL, we don't need to run the search crawler
         log.info(`Skipping Google Search query as ${input.query} is a valid URL`);
-        await addPlaywrightCrawlRequest(req, req.uniqueKey!, playwrightCrawlerKey);
+        createResponse(responseId, response);
+        await addPlaywrightCrawlRequest(req, responseId, playwrightCrawlerKey);
     } else {
         await addSearchRequest(req, response, cheerioCrawlerOptions);
     }

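Usage sketch for the direct-URL path above: when the query is a URL, Google Search is skipped, a response slot is registered up front via `createResponse`, and the page is scraped directly; per the README and `utils.ts` changes, such results carry no `searchResult` field. The `/search` route and the auth header are assumptions for the example.

```ts
const params = new URLSearchParams({
    query: 'https://openai.com/index/introducing-chatgpt-search/', // a URL, so search is skipped
    outputFormats: 'markdown',
});
const response = await fetch(`https://rag-web-browser.apify.actor/search?${params}`, {
    headers: { Authorization: `Bearer ${process.env.APIFY_API_TOKEN}` },
});
const [page] = await response.json() as Array<{ metadata?: { title?: string }; markdown?: string }>;
console.log(page.metadata?.title, page.markdown?.slice(0, 200));
```
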
‎src/playwright-req-handler.ts

+13 −8

@@ -9,7 +9,12 @@ import { addTimeMeasureEvent, transformTimeMeasuresToRelative } from './utils.js
 import { processHtml } from './website-content-crawler/html-processing.js';
 import { htmlToMarkdown } from './website-content-crawler/markdown.js';
 
-const ACTOR_TIMEOUT_AT = process.env.ACTOR_TIMEOUT_AT ? parseInt(process.env.ACTOR_TIMEOUT_AT, 10) : null;
+let ACTOR_TIMEOUT_AT: number | undefined;
+try {
+    ACTOR_TIMEOUT_AT = process.env.ACTOR_TIMEOUT_AT ? new Date(process.env.ACTOR_TIMEOUT_AT).getTime() : undefined;
+} catch (err) {
+    ACTOR_TIMEOUT_AT = undefined;
+}
 
 /**
  * Waits for the `time` to pass, but breaks early if the page is loaded (source: Website Content Crawler).
@@ -113,24 +118,24 @@ export async function requestHandlerPlaywright(
 
     const result: Output = {
         crawl: {
-            httpStatusCode: page ? response?.status() : null,
+            httpStatusCode: page ? response?.status() : undefined,
             httpStatusMessage: 'OK',
             loadedAt: new Date(),
             uniqueKey: request.uniqueKey,
             requestStatus: ContentCrawlerStatus.HANDLED,
         },
         searchResult: request.userData.searchResult!,
         metadata: {
-            author: $('meta[name=author]').first().attr('content') ?? null,
+            author: $('meta[name=author]').first().attr('content') ?? undefined,
             title: $('title').first().text(),
-            description: $('meta[name=description]').first().attr('content') ?? null,
-            languageCode: $html.first().attr('lang') ?? null,
+            description: $('meta[name=description]').first().attr('content') ?? undefined,
+            languageCode: $html.first().attr('lang') ?? undefined,
             url: request.url,
         },
         query: request.userData.query,
-        text: settings.outputFormats.includes('text') ? text : null,
-        markdown: settings.outputFormats.includes('markdown') ? htmlToMarkdown(processedHtml) : null,
-        html: settings.outputFormats.includes('html') ? processedHtml : null,
+        text: settings.outputFormats.includes('text') ? text : undefined,
+        markdown: settings.outputFormats.includes('markdown') ? htmlToMarkdown(processedHtml) : undefined,
+        html: settings.outputFormats.includes('html') ? processedHtml : undefined,
     };
 
     addTimeMeasureEvent(request.userData, 'playwright-before-response-send');

‎src/responses.ts

+16 −3

@@ -76,7 +76,7 @@ export const sendResponseOk = (responseId: string, result: unknown, contentType:
 };
 
 /**
- * Check if all results have been handled.
+ * Check if all results have been handled. It is used to determine if the response can be sent.
  */
 const checkAllResultsHandled = (responseId: string) => {
     const res = getResponse(responseId);
@@ -90,6 +90,19 @@
     return true;
 };
 
+/**
+ * Sort results by rank.
+ */
+const sortResultsByRank = (res: ResponseData): Output[] => {
+    const resultsArray = Array.from(res.resultsMap.values());
+    resultsArray.sort((a, b) => {
+        const ra = a.searchResult.rank ?? Infinity;
+        const rb = b.searchResult.rank ?? Infinity;
+        return ra - rb;
+    });
+    return resultsArray;
+};
+
 /**
  * Send response with error status code. If the response contains some handled requests,
  * return 200 status otherwise 500.
@@ -115,7 +128,7 @@ export const sendResponseError = (responseId: string, message: string) => {
     res.response.writeHead(returnStatusCode, { 'Content-Type': 'application/json' });
     if (returnStatusCode === 200) {
         log.warning(`Response for request ${responseId} has been sent with partial results`);
-        res.response.end(JSON.stringify(Array.from(res.resultsMap.values())));
+        res.response.end(JSON.stringify(sortResultsByRank(res)));
     } else {
         log.error(`Response for request ${responseId} has been sent with error: ${message}`);
         res.response.end(JSON.stringify({ errorMessage: message }));
@@ -131,7 +144,7 @@ export const sendResponseIfFinished = (responseId: string) => {
     if (!res) return;
 
     if (checkAllResultsHandled(responseId)) {
-        sendResponseOk(responseId, JSON.stringify(Array.from(res.resultsMap.values())), 'application/json');
+        sendResponseOk(responseId, JSON.stringify(sortResultsByRank(res)), 'application/json');
         responseData.delete(responseId);
     }
 };

‎src/types.ts

+8

@@ -26,12 +26,18 @@ export type Input = {
     minConcurrency: number;
     proxyConfiguration: ProxyConfigurationOptions;
     readableTextCharThreshold: number;
+    removeElementsCssSelector: string;
     removeCookieWarnings: boolean;
 };
 
+export type StandbyInput = Input & {
+    outputFormats: OutputFormats[] | string
+}
+
 export type OrganicResult = {
     description?: string;
     title?: string;
+    rank?: number;
     url?: string;
 };
 
@@ -72,10 +78,12 @@ export type UserData = {
 export interface PlaywrightScraperSettings {
     debugMode: boolean;
     dynamicContentWaitSecs: number;
+    htmlTransformer?: string
     maxHtmlCharsToProcess: number;
     outputFormats: OutputFormats[];
     readableTextCharThreshold: number;
     removeCookieWarnings?: boolean;
+    removeElementsCssSelector?: string;
 }
 
 export type Output = {

‎src/utils.ts

+2 −2

@@ -11,7 +11,7 @@ export function parseParameters(url: string): ParsedUrlQuery {
 /**
  * Check whether the query parameters are valid (do not support extra parameters)
  */
-export function checkForExtraParams(params: ParsedUrlQuery) {
+export function checkAndRemoveExtraParams(params: ParsedUrlQuery) {
     const keys = Object.keys(defaults);
     keys.push('token', '?token'); // token is a special parameter
     for (const key of Object.keys(params)) {
@@ -78,7 +78,7 @@ export function createRequest(
         uniqueKey: randomId(),
         userData: {
             responseId,
-            searchResult: result,
+            searchResult: result.url && result.title ? result : undefined,
             timeMeasures: timeMeasures ? [...timeMeasures] : [],
         },
     };

‎src/website-content-crawler/html-processing.ts

+9 −5

@@ -14,7 +14,9 @@ export async function processHtml(
     $: CheerioAPI,
 ): Promise<string> {
     const $body = $('body').clone();
-
+    if (settings.removeElementsCssSelector) {
+        $body.find(settings.removeElementsCssSelector).remove();
+    }
     const simplifiedBody = $body.html()?.trim();
 
     const simplified = typeof simplifiedBody === 'string'
@@ -31,10 +33,12 @@
         : (html ?? '');
 
     let ret = null;
-    try {
-        ret = await readableText({ html: simplified, url, settings, options: { fallbackToNone: false } });
-    } catch (error) {
-        log.warning(`Processing of HTML failed with error:`, { error });
+    if (settings.htmlTransformer === 'readableText') {
+        try {
+            ret = await readableText({ html: simplified, url, settings, options: { fallbackToNone: false } });
+        } catch (error) {
+            log.warning(`Processing of HTML failed with error:`, { error });
+        }
     }
     return ret ?? (simplified as string);
 }

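The `readableText()` helper gated above comes from the Website Content Crawler code and is not part of this diff. Purely as an illustration of what the `readableText` transformer conceptually does when enabled, here is a sketch assuming a Mozilla Readability + jsdom setup (not the Actor's actual implementation).

```ts
import { Readability } from '@mozilla/readability';
import { JSDOM } from 'jsdom';

function readableTextSketch(html: string, url: string): string | null {
    const dom = new JSDOM(html, { url });
    // Readability extracts the main article content, dropping navigation and other fluff.
    const article = new Readability(dom.window.document).parse();
    // Returning null lets the caller fall back to the simplified HTML,
    // mirroring the `ret ?? simplified` fallback in processHtml() above.
    return article?.content ?? null;
}
```
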