Skip to content

Commit d9eddc7

Browse files
authored
fix: cancel timed out requests (#65)
* return dataset it on request timeout
* Handle timed out responses and cancel requests
* Add BoundedArray class to manage timed out responses
* Remove BoundedArray and refactor timeout handling to use responseData
* Remove unused constant TIMED_OUT_RESPONSE_ARRAY_SIZE
* Update CHANGELOG for version 1.0.13
1 parent 824ea3a commit d9eddc7

File tree

3 files changed

+30
-4
lines changed

3 files changed

+30
-4
lines changed

Diff for: CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
11
This changelog summarizes all changes of the RAG Web Browser
22

3+
### 1.0.13 (2025-03-27)
4+
5+
🐛 Bug Fixes
6+
- Cancel crawling requests from timed-out search queries
7+
38
### 1.0.12 (2025-03-24)
49

510
🐛 Bug Fixes

Diff for: src/request-handler.ts

+23-2
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ import { load } from 'cheerio';
33
import { CheerioCrawlingContext, htmlToText, log, PlaywrightCrawlingContext, sleep, Request } from 'crawlee';
44

55
import { ContentCrawlerStatus, ContentCrawlerTypes } from './const.js';
6-
import { addResultToResponse, sendResponseIfFinished } from './responses.js';
6+
import { addResultToResponse, responseData, sendResponseIfFinished } from './responses.js';
77
import { Output, ContentCrawlerUserData } from './types.js';
88
import { addTimeMeasureEvent, transformTimeMeasuresToRelative } from './utils.js';
99
import { processHtml } from './website-content-crawler/html-processing.js';
@@ -27,6 +27,22 @@ async function waitForPlaywright({ page }: PlaywrightCrawlingContext, time: numb
2727
return Promise.race([page.waitForLoadState('networkidle', { timeout: 0 }), sleep(time - hardDelay)]);
2828
}
2929

30+
/**
31+
* Checks if the request should time out based on response timeout.
32+
* It verifies if the response data contains the responseId. If not, it sets the request's noRetry flag
33+
* to true and throws an error to cancel the request.
34+
*
35+
* @param {Request} request - The request object to be checked.
36+
* @param {string} responseId - The response ID to look for in the response data.
37+
* @throws {Error} Throws an error if the request times out.
38+
*/
39+
function checkTimeoutAndCancelRequest(request: Request, responseId: string) {
40+
if (!responseData.has(responseId)) {
41+
request.noRetry = true;
42+
throw new Error('Timed out. Cancelling the request...');
43+
}
44+
}
45+
3046
/**
3147
* Decide whether to wait based on the remaining time left for the Actor to run.
3248
* Always waits if the Actor is in the STANDBY_MODE.
@@ -148,7 +164,9 @@ export async function requestHandlerPlaywright(
148164
context: PlaywrightCrawlingContext<ContentCrawlerUserData>,
149165
) {
150166
const { request, response, page, closeCookieModals } = context;
151-
const { contentScraperSettings: settings } = request.userData;
167+
const { contentScraperSettings: settings, responseId } = request.userData;
168+
169+
checkTimeoutAndCancelRequest(request, responseId);
152170

153171
log.info(`Processing URL: ${request.url}`);
154172
addTimeMeasureEvent(request.userData, 'playwright-request-start');
@@ -180,6 +198,9 @@ export async function requestHandlerCheerio(
180198
context: CheerioCrawlingContext<ContentCrawlerUserData>,
181199
) {
182200
const { $, request, response } = context;
201+
const { responseId } = request.userData;
202+
203+
checkTimeoutAndCancelRequest(request, responseId);
183204

184205
log.info(`Processing URL: ${request.url}`);
185206
addTimeMeasureEvent(request.userData, 'cheerio-request-start');

Diff for: src/responses.ts

+2-2
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ type ResponseData = {
1111
timeoutId?: NodeJS.Timeout;
1212
};
1313

14-
const responseData = new Map<string, ResponseData>();
14+
export const responseData = new Map<string, ResponseData>();
1515

1616
/**
1717
* Helper function to get response object by responseId.
@@ -39,7 +39,7 @@ export function createResponsePromise(responseId: string, timeoutSecs: number):
3939

4040
// Set a timeout to reject the promise if it takes too long
4141
data.timeoutId = setTimeout(() => {
42-
sendResponseError(responseId, 'Timed out');
42+
sendResponseError(responseId, 'Timed out.');
4343
}, timeoutSecs * 1000);
4444
});
4545
}

0 commit comments

Comments
 (0)