Skip to content

Commit 81d7875

Browse files
authored
DISCO-3410 Improve fixing the urls of favicons with no proper url (#864)
1 parent d810b36 commit 81d7875

File tree

2 files changed

+16
-8
lines changed

2 files changed

+16
-8
lines changed

merino/jobs/navigational_suggestions/domain_metadata_extractor.py

+14-6
Original file line numberDiff line numberDiff line change
@@ -200,17 +200,23 @@ def _get_base_url(self, url: str) -> str:
200200
return f"{parsed_url.scheme}://{parsed_url.hostname}"
201201

202202
def _fix_url(self, url: str) -> str:
203-
"""Return a url with https scheme if the scheme is originally missing from it"""
204-
# Handle protocol-relative URLs (starting with //)
203+
# Skip empty URLs or single slash
204+
if not url or url == "/":
205+
return ""
206+
207+
# Handle protocol-relative URLs
205208
if url.startswith("//"):
206209
return f"https:{url}"
207-
# Handle URLs without protocol but with domain name structure
210+
# Handle URLs without protocol
208211
elif not url.startswith(("http://", "https://")) and not url.startswith("/"):
209212
return f"https://{url}"
210-
# Handle absolute paths without domain by keeping the format consistent
211-
# with how the calling code expects it
213+
# Handle absolute paths with base URL context
212214
elif not url.startswith(("http://", "https://")) and url.startswith("/"):
213-
return f"https:{url}"
215+
# We need real URL joining here, not string concatenation
216+
if hasattr(self, "_current_base_url") and self._current_base_url:
217+
return urljoin(self._current_base_url, url)
218+
else:
219+
return ""
214220
# Return unchanged URLs that already have a protocol
215221
return url
216222

@@ -222,6 +228,8 @@ def _get_favicon_smallest_dimension(self, image: Image) -> int:
222228
async def _extract_favicons(self, scraped_url: str) -> list[dict[str, Any]]:
223229
"""Extract all favicons for an already opened url asynchronously"""
224230
logger.info(f"Extracting all favicons for {scraped_url}")
231+
# Store the base URL for resolving relative paths
232+
self._current_base_url = scraped_url
225233
favicons: list[dict[str, Any]] = []
226234
try:
227235
favicon_data: FaviconData = self.scraper.scrape_favicon_data(scraped_url)

tests/unit/jobs/navigational_suggestions/test_domain_metadata_extractor.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -879,8 +879,8 @@ def test_fix_url() -> None:
879879
# Test with protocol-relative URLs - should add https: prefix (keeping // intact)
880880
assert extractor._fix_url("//example.com/icon.ico") == "https://example.com/icon.ico"
881881

882-
# Test with absolute paths - should add https: prefix (keeping / intact)
883-
assert extractor._fix_url("/icon.ico") == "https:/icon.ico"
882+
# Test with absolute paths - when no _current_base_url is set, should return empty string
883+
assert extractor._fix_url("/icon.ico") == ""
884884

885885
# Test with domain names without protocol - should add https:// prefix
886886
assert extractor._fix_url("example.com/icon.ico") == "https://example.com/icon.ico"

0 commit comments

Comments
 (0)