Commit ef56aac

fix: cancel requests only in standby (#68)

* Revert "chore: Revert "fix: cancel timed out requests (#65)" (#67)". This reverts commit 7d66686.
* only cancel requests for standby actors
* Update CHANGELOG.md
* Update CHANGELOG.md

1 parent 7d66686 · commit ef56aac

File tree

5 files changed: +43 -7 lines changed

Diff for: CHANGELOG.md (+10)

```diff
@@ -1,5 +1,15 @@
 This changelog summarizes all changes of the RAG Web Browser

+### 1.0.15 (2025-03-27)
+
+🐛 Bug Fixes
+- Cancel requests only in standby mode
+
+### 1.0.13 (2025-03-27)
+
+🐛 Bug Fixes
+- Cancel crawling requests from timed-out search queries
+
 ### 1.0.12 (2025-03-24)

 🐛 Bug Fixes
```

Diff for: src/main.ts (+2 -2)

```diff
@@ -10,6 +10,7 @@ import { RagWebBrowserServer } from './mcp/server.js';
 import { addTimeoutToAllResponses } from './responses.js';
 import { handleSearchRequest, handleSearchNormalMode } from './search.js';
 import { Input } from './types.js';
+import { isActorStandby } from './utils.js';

 await Actor.init();

@@ -57,10 +58,9 @@ app.use((req, res) => {
     res.status(404).json({ message: `The is nothing at route ${req.method} ${req.originalUrl}. ${HELP_MESSAGE}` });
 });

-const standbyMode = Actor.getEnv().metaOrigin === 'STANDBY';
 const originalInput = await Actor.getInput<Partial<Input>>() ?? {} as Input;

-if (standbyMode) {
+if (isActorStandby()) {
     log.info('Actor is running in the STANDBY mode.');

     const host = Actor.isAtHome() ? process.env.ACTOR_STANDBY_URL : 'http://localhost';
```
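
The inline `standbyMode` constant is replaced by the shared `isActorStandby()` helper (added in `src/utils.ts` below), so the same check can be reused inside the request handlers. A condensed sketch of the resulting branching, with the Express server setup and search handling omitted (illustration only, not the full `main.ts`):

```ts
import { Actor } from 'apify';
import { log } from 'crawlee';

import { isActorStandby } from './utils.js';

await Actor.init();

if (isActorStandby()) {
    // Standby mode: keep the HTTP server running and answer /search requests on demand.
    log.info('Actor is running in the STANDBY mode.');
} else {
    // Normal mode: run a single search based on the Actor input and then exit.
    const input = await Actor.getInput() ?? {};
    log.info('Running a single search', { input });
    await Actor.exit();
}
```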

Diff for: src/request-handler.ts (+24 -3)

```diff
@@ -3,9 +3,9 @@ import { load } from 'cheerio';
 import { CheerioCrawlingContext, htmlToText, log, PlaywrightCrawlingContext, sleep, Request } from 'crawlee';

 import { ContentCrawlerStatus, ContentCrawlerTypes } from './const.js';
-import { addResultToResponse, sendResponseIfFinished } from './responses.js';
+import { addResultToResponse, responseData, sendResponseIfFinished } from './responses.js';
 import { Output, ContentCrawlerUserData } from './types.js';
-import { addTimeMeasureEvent, transformTimeMeasuresToRelative } from './utils.js';
+import { addTimeMeasureEvent, isActorStandby, transformTimeMeasuresToRelative } from './utils.js';
 import { processHtml } from './website-content-crawler/html-processing.js';
 import { htmlToMarkdown } from './website-content-crawler/markdown.js';

@@ -27,6 +27,22 @@ async function waitForPlaywright({ page }: PlaywrightCrawlingContext, time: numb
     return Promise.race([page.waitForLoadState('networkidle', { timeout: 0 }), sleep(time - hardDelay)]);
 }

+/**
+ * Checks if the request should time out based on response timeout.
+ * It verifies if the response data contains the responseId. If not, it sets the request's noRetry flag
+ * to true and throws an error to cancel the request.
+ *
+ * @param {Request} request - The request object to be checked.
+ * @param {string} responseId - The response ID to look for in the response data.
+ * @throws {Error} Throws an error if the request times out.
+ */
+function checkTimeoutAndCancelRequest(request: Request, responseId: string) {
+    if (!responseData.has(responseId)) {
+        request.noRetry = true;
+        throw new Error('Timed out. Cancelling the request...');
+    }
+}
+
 /**
  * Decide whether to wait based on the remaining time left for the Actor to run.
  * Always waits if the Actor is in the STANDBY_MODE.
@@ -148,7 +164,9 @@ export async function requestHandlerPlaywright(
     context: PlaywrightCrawlingContext<ContentCrawlerUserData>,
 ) {
     const { request, response, page, closeCookieModals } = context;
-    const { contentScraperSettings: settings } = request.userData;
+    const { contentScraperSettings: settings, responseId } = request.userData;
+
+    if (isActorStandby()) checkTimeoutAndCancelRequest(request, responseId);

     log.info(`Processing URL: ${request.url}`);
     addTimeMeasureEvent(request.userData, 'playwright-request-start');
@@ -180,6 +198,9 @@ export async function requestHandlerCheerio(
     context: CheerioCrawlingContext<ContentCrawlerUserData>,
 ) {
     const { $, request, response } = context;
+    const { responseId } = request.userData;
+
+    if (isActorStandby()) checkTimeoutAndCancelRequest(request, responseId);

     log.info(`Processing URL: ${request.url}`);
     addTimeMeasureEvent(request.userData, 'cheerio-request-start');
```
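
The new `checkTimeoutAndCancelRequest()` guard only runs when `isActorStandby()` is true, so normal (non-standby) runs are unaffected. Conceptually: once a search response has timed out and its entry disappears from the `responseData` map, any crawl request still queued for that `responseId` is aborted instead of being crawled and retried. A standalone sketch of that behaviour (the map and request here are local stand-ins mirroring the names in the commit, not the project's actual modules):

```ts
import { Request } from 'crawlee';

// Local stand-in for the responseData map exported from src/responses.ts (sketch only).
const responseData = new Map<string, { createdAt: number }>();

function checkTimeoutAndCancelRequest(request: Request, responseId: string) {
    // The entry is gone once the HTTP response has been sent or has timed out,
    // so crawling this URL would be wasted work.
    if (!responseData.has(responseId)) {
        request.noRetry = true; // tell Crawlee not to retry this request
        throw new Error('Timed out. Cancelling the request...');
    }
}

const request = new Request({ url: 'https://example.com', userData: { responseId: 'resp-1' } });

responseData.set('resp-1', { createdAt: Date.now() });
checkTimeoutAndCancelRequest(request, 'resp-1'); // entry present → the handler continues

responseData.delete('resp-1'); // simulates the search response timing out
try {
    checkTimeoutAndCancelRequest(request, 'resp-1'); // entry gone → throws, with noRetry set
} catch (err) {
    console.log((err as Error).message, { noRetry: request.noRetry });
}
```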

Diff for: src/responses.ts (+2 -2)

```diff
@@ -11,7 +11,7 @@ type ResponseData = {
     timeoutId?: NodeJS.Timeout;
 };

-const responseData = new Map<string, ResponseData>();
+export const responseData = new Map<string, ResponseData>();

 /**
  * Helper function to get response object by responseId.
@@ -39,7 +39,7 @@ export function createResponsePromise(responseId: string, timeoutSecs: number):

         // Set a timeout to reject the promise if it takes too long
         data.timeoutId = setTimeout(() => {
-            sendResponseError(responseId, 'Timed out');
+            sendResponseError(responseId, 'Timed out.');
         }, timeoutSecs * 1000);
     });
 }
```
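
Exporting `responseData` is what lets the request handlers above check whether a response is still pending. A minimal sketch of the timeout pattern in `createResponsePromise`, simplified and assuming (as the handler-side `responseData.has()` check implies) that finalizing a response also removes its entry from the map; the real `sendResponseError` additionally writes the HTTP error response:

```ts
// Simplified sketch, not the actual src/responses.ts implementation.
type ResponseData = {
    resolve: (value: string) => void;
    timeoutId?: NodeJS.Timeout;
};

export const responseData = new Map<string, ResponseData>();

export function createResponsePromise(responseId: string, timeoutSecs: number): Promise<string> {
    return new Promise<string>((resolve) => {
        const data: ResponseData = { resolve };
        responseData.set(responseId, data);

        // Finish the pending response if it takes too long and drop the map entry,
        // so late crawl requests see the responseId as gone and cancel themselves.
        data.timeoutId = setTimeout(() => {
            data.resolve('Timed out.');
            responseData.delete(responseId);
        }, timeoutSecs * 1000);
    });
}
```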

Diff for: src/utils.ts (+5)

```diff
@@ -1,10 +1,15 @@
+import { Actor } from 'apify';
 import { RequestOptions, log, ProxyConfiguration } from 'crawlee';
 import { parse, ParsedUrlQuery } from 'querystring';

 import { defaults } from './const.js';
 import { OrganicResult, ContentScraperSettings, TimeMeasure, ContentCrawlerUserData, SearchCrawlerUserData } from './types.js';
 import inputSchema from '../.actor/input_schema.json' with { type: 'json' };

+export function isActorStandby(): boolean {
+    return Actor.getEnv().metaOrigin === 'STANDBY';
+}
+
 export function parseParameters(url: string): ParsedUrlQuery {
     const params = parse(url.slice(1));

```
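`isActorStandby()` simply reads `metaOrigin` from the Actor environment. Assuming the Apify SDK populates `metaOrigin` from the `APIFY_META_ORIGIN` environment variable (an assumption, not stated in this diff), standby behaviour can be simulated locally by setting that variable before starting the process. A hypothetical quick check:

```ts
// Run with e.g. `APIFY_META_ORIGIN=STANDBY node check.js` (hypothetical script name).
import { Actor } from 'apify';

import { isActorStandby } from './utils.js';

await Actor.init();
console.log('metaOrigin:', Actor.getEnv().metaOrigin);
console.log('standby mode:', isActorStandby()); // true only when metaOrigin === 'STANDBY'
await Actor.exit();
```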