@@ -200,17 +200,23 @@ def _get_base_url(self, url: str) -> str:
200
200
return f"{ parsed_url .scheme } ://{ parsed_url .hostname } "
201
201
202
202
def _fix_url (self , url : str ) -> str :
203
- """Return a url with https scheme if the scheme is originally missing from it"""
204
- # Handle protocol-relative URLs (starting with //)
203
+ # Skip empty URLs or single slash
204
+ if not url or url == "/" :
205
+ return ""
206
+
207
+ # Handle protocol-relative URLs
205
208
if url .startswith ("//" ):
206
209
return f"https:{ url } "
207
- # Handle URLs without protocol but with domain name structure
210
+ # Handle URLs without protocol
208
211
elif not url .startswith (("http://" , "https://" )) and not url .startswith ("/" ):
209
212
return f"https://{ url } "
210
- # Handle absolute paths without domain by keeping the format consistent
211
- # with how the calling code expects it
213
+ # Handle absolute paths with base URL context
212
214
elif not url .startswith (("http://" , "https://" )) and url .startswith ("/" ):
213
- return f"https:{ url } "
215
+ # We need real URL joining here, not string concatenation
216
+ if hasattr (self , "_current_base_url" ) and self ._current_base_url :
217
+ return urljoin (self ._current_base_url , url )
218
+ else :
219
+ return ""
214
220
# Return unchanged URLs that already have a protocol
215
221
return url
216
222
@@ -222,6 +228,8 @@ def _get_favicon_smallest_dimension(self, image: Image) -> int:
222
228
async def _extract_favicons (self , scraped_url : str ) -> list [dict [str , Any ]]:
223
229
"""Extract all favicons for an already opened url asynchronously"""
224
230
logger .info (f"Extracting all favicons for { scraped_url } " )
231
+ # Store the base URL for resolving relative paths
232
+ self ._current_base_url = scraped_url
225
233
favicons : list [dict [str , Any ]] = []
226
234
try :
227
235
favicon_data : FaviconData = self .scraper .scrape_favicon_data (scraped_url )
0 commit comments