@@ -176,6 +176,9 @@ class DomainMetadataExtractor:
         "Service unavailable",
     ]
 
+    # Constants for favicon URL validation
+    MANIFEST_JSON_BASE64_MARKER = "/application/manifest+json;base64,"
+
     # List of blocked (second level) domains
     blocked_domains: set[str]
     scraper: Scraper
@@ -225,15 +228,15 @@ async def _extract_favicons(self, scraped_url: str) -> list[dict[str, Any]]:
 
         for favicon in favicon_data.links:
             favicon_url = favicon["href"]
-            if favicon_url.startswith("data:"):
+            if self._is_problematic_favicon_url(favicon_url):
                 continue
             if not favicon_url.startswith("http") and not favicon_url.startswith("//"):
                 favicon["href"] = urljoin(scraped_url, favicon_url)
             favicons.append(favicon)
 
         for favicon in favicon_data.metas:
             favicon_url = favicon["content"]
-            if favicon_url.startswith("data:"):
+            if self._is_problematic_favicon_url(favicon_url):
                 continue
             if not favicon_url.startswith("http") and not favicon_url.startswith("//"):
                 favicon["href"] = urljoin(scraped_url, favicon_url)
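For context, the unchanged lines above resolve relative favicon hrefs against the scraped page URL, while hrefs already starting with "http" or "//" are left as-is. A quick illustration of that normalization with `urljoin` (the page and favicon paths below are hypothetical):

```python
from urllib.parse import urljoin

# Hypothetical page URL and favicon hrefs, illustrating the normalization above.
page = "https://example.com/blog/post"
print(urljoin(page, "/static/favicon.ico"))  # https://example.com/static/favicon.ico
print(urljoin(page, "icons/favicon.png"))    # https://example.com/blog/icons/favicon.png
```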
@@ -254,8 +257,8 @@ async def _extract_favicons(self, scraped_url: str) -> list[dict[str, Any]]:
             manifest_urls.append(manifest_absolute_url)
 
         if manifest_tasks:
-            # Use smaller chunk size for manifest tasks to limit resource usage
-            chunk_size = 10
+            # Reduce chunk size for manifest tasks to limit resource consumption
+            chunk_size = 5
             for i in range(0, len(manifest_tasks), chunk_size):
                 chunk = manifest_tasks[i : i + chunk_size]
                 chunk_urls = manifest_urls[i : i + chunk_size]
@@ -280,6 +283,11 @@ async def _extract_favicons(self, scraped_url: str) -> list[dict[str, Any]]:
                 for scraped_favicon in scraped_favicons_result:
                     # Check if the favicon URL already contains a scheme
                     favicon_src = scraped_favicon.get("src", "")
+
+                    # Skip problematic data URLs or invalid formats
+                    if self._is_problematic_favicon_url(favicon_src):
+                        continue
+
                     if favicon_src.startswith(("http://", "https://")):
                         favicon_url = favicon_src
                     else:
@@ -305,39 +313,47 @@ async def _get_best_favicon(self, favicons: list[dict[str, Any]], min_width: int
         best_favicon_url = ""
         best_favicon_width = 0
 
-        # Process favicons in chunks to limit concurrent connections
-        chunk_size = 20
-        all_favicon_images = []
+        # Process favicons in smaller chunks to limit concurrent connections and memory usage
+        chunk_size = 10
 
-        for chunk_urls in itertools.batched(urls, chunk_size):
+        for chunk_idx, chunk_urls in enumerate(itertools.batched(urls, chunk_size)):
             chunk_images = await self.favicon_downloader.download_multiple_favicons(
                 list(chunk_urls)
             )
-            all_favicon_images.extend(chunk_images)
 
-        favicon_images = all_favicon_images
+            # Calculate the offset in the favicons list for this chunk
+            favicon_offset = chunk_idx * chunk_size
+
+            # Process this chunk immediately
+            for i, (image, url) in enumerate(zip(chunk_images, chunk_urls)):
+                if image is None or "image/" not in image.content_type:
+                    continue
+
+                # First priority: If favicon is an SVG and not masked, select it immediately
+                if (
+                    image.content_type == "image/svg+xml"
+                    and (i + favicon_offset) not in masked_svg_indices
+                ):
+                    # Clear variables to help with garbage collection
+                    del chunk_images
 
-        # First pass: Look for SVG favicons (they are priority)
-        for i, (favicon, image) in enumerate(zip(favicons, favicon_images)):
-            if image is None or "image/" not in image.content_type:
-                continue
+                    # Return immediately on finding a good SVG
+                    return url
 
-            # If favicon is an SVG and not masked, return it immediately
-            if image.content_type == "image/svg+xml" and i not in masked_svg_indices:
-                return urls[i]
+                # Second priority: Track the highest resolution bitmap favicon
+                try:
+                    width = self._get_favicon_smallest_dimension(image)
+                    if width > best_favicon_width:
+                        best_favicon_url = url
+                        best_favicon_width = width
+                except Exception as e:
+                    logger.warning(f"Exception {e} for favicon at position {i + favicon_offset}")
 
-        # Second pass: Look for the highest resolution bitmap favicon
-        for i, (favicon, image) in enumerate(zip(favicons, favicon_images)):
-            if image is None or "image/" not in image.content_type:
-                continue
+            # Explicitly clear chunk_images to free memory immediately
+            del chunk_images
 
-            try:
-                width = self._get_favicon_smallest_dimension(image)
-                if width > best_favicon_width:
-                    best_favicon_url = urls[i]
-                    best_favicon_width = width
-            except Exception as e:
-                logger.warning(f"Exception {e} for favicon {favicon}")
+            # Add a delay between batches to prevent network resource exhaustion
+            await asyncio.sleep(0.5)
 
         logger.debug(f"Best favicon url: {best_favicon_url}, width: {best_favicon_width}")
         return best_favicon_url if best_favicon_width >= min_width else ""
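For reference, the batching pattern introduced here reduces to the sketch below: download in fixed-size chunks with `itertools.batched` (Python 3.12+), map each result back to its position in the original list via a chunk offset, and pause between batches. The downloader is a stand-in coroutine, not the real `favicon_downloader.download_multiple_favicons`, and the content-type and masking checks are elided.

```python
import asyncio
import itertools


async def fake_download_many(urls: list[str]) -> list[bytes | None]:
    """Stand-in for favicon_downloader.download_multiple_favicons."""
    await asyncio.sleep(0)  # pretend there is network I/O here
    return [url.encode() if url.endswith(".svg") else None for url in urls]


async def first_good_url(urls: list[str], chunk_size: int = 10) -> str:
    """Download URLs in fixed-size batches; stop at the first usable result."""
    for chunk_idx, chunk_urls in enumerate(itertools.batched(urls, chunk_size)):
        images = await fake_download_many(list(chunk_urls))
        offset = chunk_idx * chunk_size  # index of chunk_urls[0] within `urls`
        for i, (image, url) in enumerate(zip(images, chunk_urls)):
            if image is None:
                continue
            # i + offset is this favicon's position in the original list,
            # which is what the real code checks against masked_svg_indices.
            return url
        del images                # release this batch before fetching the next
        await asyncio.sleep(0.5)  # throttle between batches, as in the diff
    return ""


print(asyncio.run(first_good_url([f"u{i}.png" for i in range(9)] + ["icon.svg"], chunk_size=4)))
# -> icon.svg
```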
@@ -348,10 +364,15 @@ async def _get_favicon(self, scraped_url: str, min_width: int) -> str:
         with the highest resolution.
         """
         favicons: list[dict[str, Any]] = await self._extract_favicons(scraped_url)
-        logger.info(
-            f"{len(favicons)} favicons extracted for {scraped_url}. Favicons are: {favicons}"
-        )
-        return await self._get_best_favicon(favicons, min_width)
+        logger.info(f"{len(favicons)} favicons extracted for {scraped_url}")
+
+        # Get the best favicon
+        result = await self._get_best_favicon(favicons, min_width)
+
+        # Explicitly clear the favicons list to free memory
+        favicons.clear()
+
+        return result
 
     def _extract_title(self) -> Optional[str]:
         """Extract title for a url"""
@@ -379,6 +400,10 @@ def _is_domain_blocked(self, domain: str, suffix: str) -> bool:
         second_level_domain: str = self._get_second_level_domain(domain, suffix)
         return second_level_domain in self.blocked_domains
 
+    def _is_problematic_favicon_url(self, favicon_url: str) -> bool:
+        """Check if a favicon URL is problematic (data URL or base64 manifest)"""
+        return favicon_url.startswith("data:") or self.MANIFEST_JSON_BASE64_MARKER in favicon_url
+
     def get_domain_metadata(
         self, domains_data: list[dict[str, Any]], favicon_min_width: int
     ) -> list[dict[str, Optional[str]]]:
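The new helper folds the previous `data:` check together with the manifest marker check. A quick standalone check of the intended behavior, using hypothetical URLs and the constant copied from this diff:

```python
# Marker value copied from the diff; all input URLs below are hypothetical.
MANIFEST_JSON_BASE64_MARKER = "/application/manifest+json;base64,"


def is_problematic(favicon_url: str) -> bool:
    return favicon_url.startswith("data:") or MANIFEST_JSON_BASE64_MARKER in favicon_url


print(is_problematic("data:image/png;base64,iVBORw0KGgo="))  # True  (inline data URL)
print(is_problematic(
    "https://example.com/application/manifest+json;base64,eyJuYW1lIjogIngifQ=="
))                                                            # True  (base64 manifest)
print(is_problematic("https://example.com/favicon.ico"))      # False
```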
@@ -389,21 +414,30 @@ async def _process_domains_async(
         self, domains_data: list[dict[str, Any]], favicon_min_width: int
     ) -> list[dict[str, Optional[str]]]:
         """Process domains in chunks to limit resource consumption."""
-        chunk_size = 100
+        # Reduce batch size to decrease memory consumption and network load
+        chunk_size = 25
         filtered_results: list[dict[str, Optional[str]]] = []
 
         for i in range(0, len(domains_data), chunk_size):
-            chunk = domains_data[i : i + chunk_size]
+            end_idx = min(i + chunk_size, len(domains_data))
+            chunk = domains_data[i:end_idx]
             tasks = [
                 self._process_single_domain(domain_data, favicon_min_width)
                 for domain_data in chunk
             ]
             logger.info(
                 f"Processing chunk of {len(chunk)} domains concurrently "
-                f"({i + 1}-{min(i + chunk_size, len(domains_data))} of {len(domains_data)})"
+                f"({i + 1}-{end_idx} of {len(domains_data)})"
             )
+
+            # Process current chunk with gather
             chunk_results = await asyncio.gather(*tasks, return_exceptions=True)
 
+            # Add a longer delay between chunks to allow system resources to recover
+            if end_idx < len(domains_data):
+                await asyncio.sleep(2.0)
+
+            # Process results
             for result in chunk_results:
                 if isinstance(result, Exception):
                     logger.error(f"Error processing domain: {result}")
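Taken on its own, the chunk/gather/delay pattern above looks like the sketch below. `process_one` is a stand-in for `_process_single_domain`; the 2-second pause between chunks matches the diff, and the error branch is simplified (the real code logs the exception and continues).

```python
import asyncio


async def process_one(item: str) -> str:
    """Stand-in for _process_single_domain."""
    await asyncio.sleep(0)  # pretend there is per-domain scraping here
    if item == "bad":
        raise ValueError(item)
    return item.upper()


async def process_all(items: list[str], chunk_size: int = 25) -> list[str]:
    results: list[str] = []
    for i in range(0, len(items), chunk_size):
        end_idx = min(i + chunk_size, len(items))
        chunk = items[i:end_idx]
        # return_exceptions=True keeps one failing item from cancelling the whole chunk
        chunk_results = await asyncio.gather(
            *(process_one(item) for item in chunk), return_exceptions=True
        )
        # Pause between chunks, but not after the final one
        if end_idx < len(items):
            await asyncio.sleep(2.0)
        for result in chunk_results:
            if isinstance(result, Exception):
                continue  # the real code logs the error instead of silently skipping
            results.append(result)
    return results


print(asyncio.run(process_all(["a", "bad", "b"], chunk_size=2)))
# -> ['A', 'B']
```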