Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 3 additions & 21 deletions .actor/input_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -97,33 +97,15 @@
"prefill": "none",
"editor": "hidden"
},
"initialConcurrency": {
"title": "Initial browsing concurrency",
"desiredConcurrency": {
"title": "Desired browsing concurrency",
"type": "integer",
"description": "The initial number of web browsers running in parallel. The system automatically scales the number based on the CPU and memory usage, in the range specified by `minConcurrency` and `maxConcurrency`. If the initial value is `0`, the Actor picks the number automatically based on the available memory.",
"description": "The desired number of web browsers running in parallel. The system automatically scales the number based on the CPU and memory usage. If the initial value is `0`, the Actor picks the number automatically based on the available memory.",
"minimum": 0,
"maximum": 50,
"default": 5,
"editor": "hidden"
},
"minConcurrency": {
"title": "Minimum browsing concurrency",
"type": "integer",
"description": "The minimum number of web browsers running in parallel.",
"minimum": 1,
"maximum": 50,
"default": 3,
"editor": "hidden"
},
"maxConcurrency": {
"title": "Maximum browsing concurrency",
"type": "integer",
"description": "The maximum number of web browsers running in parallel.",
"minimum": 1,
"maximum": 100,
"default": 50,
"editor": "hidden"
},
"maxRequestRetries": {
"title": "Target page max retries",
"type": "integer",
Expand Down
29 changes: 0 additions & 29 deletions src/const.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import inputSchema from '../.actor/input_schema.json' with { type: 'json' };

export enum ContentCrawlerStatus {
PENDING = 'pending',
HANDLED = 'handled',
Expand All @@ -18,30 +16,3 @@ export enum ContentCrawlerTypes {
}

export const PLAYWRIGHT_REQUEST_TIMEOUT_NORMAL_MODE_SECS = 60;

// Default values parsed from input_schema.json
export const defaults = {
debugMode: inputSchema.properties.debugMode.default,
dynamicContentWaitSecs: inputSchema.properties.dynamicContentWaitSecs.default,
htmlTransformer: inputSchema.properties.htmlTransformer.default,
initialConcurrency: inputSchema.properties.initialConcurrency.default,
keepAlive: true, // Not in input_schema.json
maxConcurrency: inputSchema.properties.maxConcurrency.default,
maxRequestRetries: inputSchema.properties.maxRequestRetries.default,
maxRequestRetriesMax: inputSchema.properties.maxRequestRetries.maximum,
maxResults: inputSchema.properties.maxResults.default,
maxResultsMax: inputSchema.properties.maxResults.maximum,
minConcurrency: inputSchema.properties.minConcurrency.default,
outputFormats: inputSchema.properties.outputFormats.default,
proxyConfiguration: inputSchema.properties.proxyConfiguration.default,
query: undefined, // No default value in input_schema.json
readableTextCharThreshold: 100, // Not in input_schema.json
removeCookieWarnings: inputSchema.properties.removeCookieWarnings.default,
removeElementsCssSelector: inputSchema.properties.removeElementsCssSelector.default,
requestTimeoutSecs: inputSchema.properties.requestTimeoutSecs.default,
requestTimeoutSecsMax: inputSchema.properties.requestTimeoutSecs.maximum,
serpMaxRetries: inputSchema.properties.serpMaxRetries.default,
serpMaxRetriesMax: inputSchema.properties.serpMaxRetries.maximum,
serpProxyGroup: inputSchema.properties.serpProxyGroup.default,
scrapingTool: inputSchema.properties.scrapingTool.default,
};
171 changes: 131 additions & 40 deletions src/input.ts
Original file line number Diff line number Diff line change
@@ -1,16 +1,24 @@
import { Actor } from 'apify';
import { Actor, ProxyConfigurationOptions } from 'apify';
import { BrowserName, CheerioCrawlerOptions, log, ProxyConfiguration } from 'crawlee';
import { firefox } from 'playwright';

import { ContentCrawlerTypes, defaults } from './const.js';
import { ContentCrawlerTypes } from './const.js';
import { UserInputError } from './errors.js';
import type { Input, ContentScraperSettings, OutputFormats, StandbyInput, ContentCrawlerOptions } from './types.js';
import type {
Input,
ContentScraperSettings,
OutputFormats,
ContentCrawlerOptions,
ScrapingTool,
SERPProxyGroup,
} from './types.js';
import inputSchema from '../.actor/input_schema.json' with { type: 'json' };

/**
* Processes the input and returns an array of crawler settings. This is ideal for startup of STANDBY mode
* because it makes it simple to start all crawlers at once.
*/
export async function processStandbyInput(originalInput: Partial<Input> | Partial<StandbyInput>) {
export async function processStandbyInput(originalInput: Partial<Input>) {
const { input, searchCrawlerOptions, contentScraperSettings } = await processInputInternal(originalInput, true);

const proxy = await Actor.createProxyConfiguration(input.proxyConfiguration);
Expand All @@ -25,13 +33,13 @@ export async function processStandbyInput(originalInput: Partial<Input> | Partia
/**
* Processes the input and returns the settings for the crawler.
*/
export async function processInput(originalInput: Partial<Input> | Partial<StandbyInput>) {
export async function processInput(originalInput: Partial<Input>) {
const { input, searchCrawlerOptions, contentScraperSettings } = await processInputInternal(originalInput);

const proxy = await Actor.createProxyConfiguration(input.proxyConfiguration);
const contentCrawlerOptions: ContentCrawlerOptions = input.scrapingTool === 'raw-http'
? createCheerioCrawlerOptions(input, proxy)
: createPlaywrightCrawlerOptions(input, proxy);
? createCheerioCrawlerOptions(input, proxy, false)
: createPlaywrightCrawlerOptions(input, proxy, false);

return { input, searchCrawlerOptions, contentCrawlerOptions, contentScraperSettings };
}
Expand All @@ -40,31 +48,30 @@ export async function processInput(originalInput: Partial<Input> | Partial<Stand
* Processes the input and returns the settings for the crawler (adapted from: Website Content Crawler).
*/
async function processInputInternal(
originalInput: Partial<Input> | Partial<StandbyInput>,
originalInput: Partial<Input>,
standbyInit: boolean = false,
) {
if (originalInput.outputFormats && typeof originalInput.outputFormats === 'string') {
originalInput.outputFormats = originalInput.outputFormats.split(',').map((format) => format.trim()) as OutputFormats[];
}
const input = { ...defaults, ...originalInput } as Input;
// const input = { ...defaults, ...originalInput } as Input;

validateAndFillInput(input, standbyInit);
const input = validateAndFillInput(originalInput, standbyInit);

const {
debugMode,
dynamicContentWaitSecs,
keepAlive,
serpMaxRetries,
serpProxyGroup,
outputFormats,
readableTextCharThreshold,
removeElementsCssSelector,
htmlTransformer,
removeCookieWarnings,
} = input;

log.setLevel(debugMode ? log.LEVELS.DEBUG : log.LEVELS.INFO);

const proxySearch = await Actor.createProxyConfiguration({ groups: [serpProxyGroup] });
const searchCrawlerOptions: CheerioCrawlerOptions = {
keepAlive,
keepAlive: standbyInit,
maxRequestRetries: serpMaxRetries,
proxyConfiguration: proxySearch,
autoscaledPoolOptions: { desiredConcurrency: 1 },
Expand All @@ -73,19 +80,23 @@ async function processInputInternal(
const contentScraperSettings: ContentScraperSettings = {
debugMode,
dynamicContentWaitSecs,
htmlTransformer: 'none',
htmlTransformer,
maxHtmlCharsToProcess: 1.5e6,
outputFormats: input.outputFormats as OutputFormats[],
outputFormats,
readableTextCharThreshold,
removeCookieWarnings,
removeElementsCssSelector: input.removeElementsCssSelector,
removeElementsCssSelector,
};

return { input, searchCrawlerOptions, contentScraperSettings };
}

function createPlaywrightCrawlerOptions(input: Input, proxy: ProxyConfiguration | undefined): ContentCrawlerOptions {
const { keepAlive, maxRequestRetries, initialConcurrency, maxConcurrency, minConcurrency } = input;
function createPlaywrightCrawlerOptions(
input: Input,
proxy: ProxyConfiguration | undefined,
keepAlive: boolean = true,
): ContentCrawlerOptions {
const { maxRequestRetries, desiredConcurrency } = input;

return {
type: ContentCrawlerTypes.PLAYWRIGHT,
Expand All @@ -107,16 +118,18 @@ function createPlaywrightCrawlerOptions(input: Input, proxy: ProxyConfiguration
retireInactiveBrowserAfterSecs: 60,
},
autoscaledPoolOptions: {
desiredConcurrency: initialConcurrency === 0 ? undefined : Math.min(initialConcurrency, maxConcurrency),
maxConcurrency,
minConcurrency,
desiredConcurrency,
},
},
};
}

function createCheerioCrawlerOptions(input: Input, proxy: ProxyConfiguration | undefined): ContentCrawlerOptions {
const { keepAlive, maxRequestRetries, initialConcurrency, maxConcurrency, minConcurrency } = input;
function createCheerioCrawlerOptions(
input: Input,
proxy: ProxyConfiguration | undefined,
keepAlive: boolean = true,
): ContentCrawlerOptions {
const { maxRequestRetries, desiredConcurrency } = input;

return {
type: ContentCrawlerTypes.CHEERIO,
Expand All @@ -126,9 +139,7 @@ function createCheerioCrawlerOptions(input: Input, proxy: ProxyConfiguration | u
proxyConfiguration: proxy,
requestHandlerTimeoutSecs: input.requestTimeoutSecs,
autoscaledPoolOptions: {
desiredConcurrency: initialConcurrency === 0 ? undefined : Math.min(initialConcurrency, maxConcurrency),
maxConcurrency,
minConcurrency,
desiredConcurrency,
},
},
};
Expand All @@ -139,7 +150,7 @@ function createCheerioCrawlerOptions(input: Input, proxy: ProxyConfiguration | u
* Do not validate query parameter when standbyInit is true.
* This is a bit ugly, but it's necessary to avoid throwing an error when the query is not provided in standby mode.
*/
export function validateAndFillInput(input: Input, standbyInit: boolean) {
function validateAndFillInput(input: Partial<Input>, standbyInit: boolean): Input {
const validateRange = (
value: number | string | undefined,
min: number,
Expand All @@ -149,7 +160,7 @@ export function validateAndFillInput(input: Input, standbyInit: boolean) {
) => {
// parse the value as a number to check if it's a valid number
if (value === undefined) {
log.warning(`The \`${fieldName}\` parameter must be defined. Using the default value ${defaultValue} instead.`);
log.info(`The \`${fieldName}\` parameter is not defined. Using the default value ${defaultValue}.`);
return defaultValue;
} if (typeof value === 'string') {
value = Number(value);
Expand All @@ -162,28 +173,108 @@ export function validateAndFillInput(input: Input, standbyInit: boolean) {
}
return value;
};

// Throw an error if the query is not provided and standbyInit is false.
if (!input.query && !standbyInit) {
throw new UserInputError('The `query` parameter must be provided and non-empty.');
}

input.maxResults = validateRange(input.maxResults, 1, defaults.maxResultsMax, defaults.maxResults, 'maxResults');
input.requestTimeoutSecs = validateRange(input.requestTimeoutSecs, 1, defaults.requestTimeoutSecsMax, defaults.requestTimeoutSecs, 'requestTimeoutSecs');
input.serpMaxRetries = validateRange(input.serpMaxRetries, 0, defaults.serpMaxRetriesMax, defaults.serpMaxRetries, 'serpMaxRetries');
input.maxRequestRetries = validateRange(input.maxRequestRetries, 0, defaults.maxRequestRetriesMax, defaults.maxRequestRetries, 'maxRequestRetries');
// Max results
input.maxResults = validateRange(
input.maxResults,
inputSchema.properties.maxResults.minimum,
inputSchema.properties.maxResults.maximum,
inputSchema.properties.maxResults.default,
'maxResults',
);

// Output formats
if (!input.outputFormats || input.outputFormats.length === 0) {
input.outputFormats = defaults.outputFormats as OutputFormats[];
log.warning(`The \`outputFormats\` parameter must be a non-empty array. Using default value \`${defaults.outputFormats}\`.`);
input.outputFormats = inputSchema.properties.outputFormats.default as OutputFormats[];
log.info(`The \`outputFormats\` parameter is not defined. Using default value \`${input.outputFormats}\`.`);
} else if (input.outputFormats.some((format) => !['text', 'markdown', 'html'].includes(format))) {
throw new UserInputError('The `outputFormats` array may only contain `text`, `markdown`, or `html`.');
}
if (input.serpProxyGroup !== 'GOOGLE_SERP' && input.serpProxyGroup !== 'SHADER') {

// Request timeout seconds
input.requestTimeoutSecs = validateRange(
input.requestTimeoutSecs,
inputSchema.properties.requestTimeoutSecs.minimum,
inputSchema.properties.requestTimeoutSecs.maximum,
inputSchema.properties.requestTimeoutSecs.default,
'requestTimeoutSecs',
);

// SERP proxy group
if (!input.serpProxyGroup || input.serpProxyGroup.length === 0) {
input.serpProxyGroup = inputSchema.properties.serpProxyGroup.default as SERPProxyGroup;
} else if (input.serpProxyGroup !== 'GOOGLE_SERP' && input.serpProxyGroup !== 'SHADER') {
throw new UserInputError('The `serpProxyGroup` parameter must be either `GOOGLE_SERP` or `SHADER`.');
}
if (input.dynamicContentWaitSecs >= input.requestTimeoutSecs) {
input.dynamicContentWaitSecs = Math.round(input.requestTimeoutSecs / 2);

// SERP max retries
input.serpMaxRetries = validateRange(
input.serpMaxRetries,
inputSchema.properties.serpMaxRetries.minimum,
inputSchema.properties.serpMaxRetries.maximum,
inputSchema.properties.serpMaxRetries.default,
'serpMaxRetries',
);

// Proxy configuration
if (!input.proxyConfiguration) {
input.proxyConfiguration = inputSchema.properties.proxyConfiguration.default as ProxyConfigurationOptions;
}
if (input.scrapingTool !== 'browser-playwright' && input.scrapingTool !== 'raw-http') {

// Scraping tool
if (!input.scrapingTool) {
input.scrapingTool = inputSchema.properties.scrapingTool.default as ScrapingTool;
} else if (input.scrapingTool !== 'browser-playwright' && input.scrapingTool !== 'raw-http') {
throw new UserInputError('The `scrapingTool` parameter must be either `browser-playwright` or `raw-http`.');
}

// Remove elements CSS selector
if (!input.removeElementsCssSelector) {
input.removeElementsCssSelector = inputSchema.properties.removeElementsCssSelector.default;
}

// HTML transformer
if (!input.htmlTransformer) {
input.htmlTransformer = inputSchema.properties.htmlTransformer.default;
}

// Desired concurrency
input.desiredConcurrency = validateRange(
input.desiredConcurrency,
inputSchema.properties.desiredConcurrency.minimum,
inputSchema.properties.desiredConcurrency.maximum,
inputSchema.properties.desiredConcurrency.default,
'desiredConcurrency',
);

// Max request retries
input.maxRequestRetries = validateRange(
input.maxRequestRetries,
inputSchema.properties.maxRequestRetries.minimum,
inputSchema.properties.maxRequestRetries.maximum,
inputSchema.properties.maxRequestRetries.default,
'maxRequestRetries',
);

// Dynamic content wait seconds
if (!input.dynamicContentWaitSecs || input.dynamicContentWaitSecs >= input.requestTimeoutSecs) {
input.dynamicContentWaitSecs = Math.round(input.requestTimeoutSecs / 2);
}

// Remove cookie warnings
if (input.removeCookieWarnings === undefined) {
input.removeCookieWarnings = inputSchema.properties.removeCookieWarnings.default;
}

// Debug mode
if (input.debugMode === undefined) {
input.debugMode = inputSchema.properties.debugMode.default;
}

return input as Input;
}
3 changes: 1 addition & 2 deletions src/main.ts
Original file line number Diff line number Diff line change
Expand Up @@ -80,15 +80,14 @@ if (standbyMode) {
`);

app.listen(port, async () => {
log.info(`The Actor web server is listening for user requests at ${host}:${port}`);

const promises: Promise<unknown>[] = [];
promises.push(createAndStartSearchCrawler(searchCrawlerOptions));
for (const settings of contentCrawlerOptions) {
promises.push(createAndStartContentCrawler(settings));
}

await Promise.all(promises);
log.info(`The Actor web server is listening for user requests at ${host}:${port}`);
});
} else {
log.info('Actor is running in the NORMAL mode.');
Expand Down
6 changes: 4 additions & 2 deletions src/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ async function runSearchProcess(params: Partial<Input>): Promise<Output[]> {
contentScraperSettings,
} = await processInput(params);

// Set keepAlive to true to find the correct crawlers
searchCrawlerOptions.keepAlive = true;
contentCrawlerOptions.crawlerOptions.keepAlive = true;

await createAndStartSearchCrawler(searchCrawlerOptions);
const { key: contentCrawlerKey } = await createAndStartContentCrawler(contentCrawlerOptions);

Expand Down Expand Up @@ -137,8 +141,6 @@ export async function handleSearchNormalMode(input: Input,
contentScraperSettings: ContentScraperSettings,
) {
const startedTime = Date.now();
searchCrawlerOptions.keepAlive = false;
contentCrawlerOptions.crawlerOptions.keepAlive = false;
contentCrawlerOptions.crawlerOptions.requestHandlerTimeoutSecs = PLAYWRIGHT_REQUEST_TIMEOUT_NORMAL_MODE_SECS;

const { crawler: searchCrawler } = await createAndStartSearchCrawler(searchCrawlerOptions, false);
Expand Down
Loading
Loading