@@ -3,9 +3,9 @@ import { load } from 'cheerio';
3
3
import { CheerioCrawlingContext , htmlToText , log , PlaywrightCrawlingContext , sleep , Request } from 'crawlee' ;
4
4
5
5
import { ContentCrawlerStatus , ContentCrawlerTypes } from './const.js' ;
6
- import { addResultToResponse , sendResponseIfFinished } from './responses.js' ;
6
+ import { addResultToResponse , responseData , sendResponseIfFinished } from './responses.js' ;
7
7
import { Output , ContentCrawlerUserData } from './types.js' ;
8
- import { addTimeMeasureEvent , transformTimeMeasuresToRelative } from './utils.js' ;
8
+ import { addTimeMeasureEvent , isActorStandby , transformTimeMeasuresToRelative } from './utils.js' ;
9
9
import { processHtml } from './website-content-crawler/html-processing.js' ;
10
10
import { htmlToMarkdown } from './website-content-crawler/markdown.js' ;
11
11
@@ -27,6 +27,22 @@ async function waitForPlaywright({ page }: PlaywrightCrawlingContext, time: numb
27
27
return Promise . race ( [ page . waitForLoadState ( 'networkidle' , { timeout : 0 } ) , sleep ( time - hardDelay ) ] ) ;
28
28
}
29
29
30
/**
 * Stops handling of a request whose response ID is no longer present in `responseData`.
 *
 * Looks up `responseId` in `responseData`. If an entry exists, the function returns
 * normally. Otherwise it sets `request.noRetry` to `true` and throws, which aborts
 * the calling request handler.
 *
 * NOTE(review): a missing entry presumably means the response already timed out and
 * was removed from `responseData` — confirm against `./responses.js`.
 *
 * @param request - Request being handled; its `noRetry` flag is set before throwing.
 * @param responseId - Key looked up in `responseData`.
 * @throws {Error} When `responseData` has no entry for `responseId`.
 */
function checkTimeoutAndCancelRequest(request: Request, responseId: string) {
    const isStillTracked = responseData.has(responseId);
    if (isStillTracked) return;

    // Flag the request as non-retryable before failing it, so the error thrown
    // below is not followed by another attempt (per the Crawlee `noRetry` flag).
    request.noRetry = true;
    throw new Error('Timed out. Cancelling the request...');
}
45
+
30
46
/**
31
47
* Decide whether to wait based on the remaining time left for the Actor to run.
32
48
* Always waits if the Actor is in the STANDBY_MODE.
@@ -148,7 +164,9 @@ export async function requestHandlerPlaywright(
148
164
context : PlaywrightCrawlingContext < ContentCrawlerUserData > ,
149
165
) {
150
166
const { request, response, page, closeCookieModals } = context ;
151
- const { contentScraperSettings : settings } = request . userData ;
167
+ const { contentScraperSettings : settings , responseId } = request . userData ;
168
+
169
+ if ( isActorStandby ( ) ) checkTimeoutAndCancelRequest ( request , responseId ) ;
152
170
153
171
log . info ( `Processing URL: ${ request . url } ` ) ;
154
172
addTimeMeasureEvent ( request . userData , 'playwright-request-start' ) ;
@@ -180,6 +198,9 @@ export async function requestHandlerCheerio(
180
198
context : CheerioCrawlingContext < ContentCrawlerUserData > ,
181
199
) {
182
200
const { $, request, response } = context ;
201
+ const { responseId } = request . userData ;
202
+
203
+ if ( isActorStandby ( ) ) checkTimeoutAndCancelRequest ( request , responseId ) ;
183
204
184
205
log . info ( `Processing URL: ${ request . url } ` ) ;
185
206
addTimeMeasureEvent ( request . userData , 'cheerio-request-start' ) ;
0 commit comments