From d50e596f26c58d6e275fcdb2b297ee150d7d4f94 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 21 Mar 2026 04:03:51 +0000 Subject: [PATCH 1/4] Initial plan From 87ce27d453c9bffedb4771593f3082be9a417279 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sat, 21 Mar 2026 04:10:51 +0000 Subject: [PATCH 2/4] Extract shared _sanitize_url to url_utils.py, simplify to space-only encoding, fix test robustness Co-authored-by: bpulluta <115118857+bpulluta@users.noreply.github.com> Agent-Logs-Url: https://github.com/NatLabRockies/COMPASS/sessions/ceb782b4-c312-41d1-b4eb-eccbbef67097 --- compass/scripts/download.py | 25 +---------------- compass/web/url_utils.py | 20 +++++++++++++ compass/web/website_crawl.py | 37 ++----------------------- tests/python/unit/web/test_web_crawl.py | 1 + 4 files changed, 24 insertions(+), 59 deletions(-) create mode 100644 compass/web/url_utils.py diff --git a/compass/scripts/download.py b/compass/scripts/download.py index 9199eed8..213ada92 100644 --- a/compass/scripts/download.py +++ b/compass/scripts/download.py @@ -2,14 +2,6 @@ import logging from contextlib import AsyncExitStack -from urllib.parse import ( - parse_qsl, - quote, - unquote, - urlencode, - urlparse, - urlunparse, -) from elm.web.document import PDFDocument from elm.web.search.run import ( @@ -32,6 +24,7 @@ JurisdictionWebsiteValidator, ) from compass.web.website_crawl import COMPASSCrawler, COMPASSLinkScorer +from compass.web.url_utils import _sanitize_url from compass.utilities.enums import LLMTasks from compass.utilities.io import load_local_docs from compass.pb import COMPASS_PB @@ -804,22 +797,6 @@ async def _contains_relevant_text( return found_text -def _sanitize_url(url): - """Percent-encode spaces and unsafe characters in a URL path""" - parsed = urlparse(url) - safe_path = quote(unquote(parsed.path), safe="/") - query_params = parse_qsl(parsed.query, keep_blank_values=True) - safe_query = urlencode(query_params, doseq=True) # cspell: disable-line - return urlunparse(( - parsed.scheme, - parsed.netloc, - safe_path, - parsed.params, - safe_query, - parsed.fragment, - )) - - def _sanitize_doc_sources(docs): """Rewrite source attrs on documents returned by ELMWebsiteCrawler diff --git a/compass/web/url_utils.py b/compass/web/url_utils.py new file mode 100644 index 00000000..31aee68c --- /dev/null +++ b/compass/web/url_utils.py @@ -0,0 +1,20 @@ +"""Shared URL utilities for COMPASS web modules""" + +from urllib.parse import urlparse, urlunparse + + +def _sanitize_url(url): + """Encode spaces in a URL path; leave query string intact""" + parsed = urlparse(url) + path = parsed.path + safe_path = path.replace(" ", "%20") if " " in path else path + return urlunparse( + ( + parsed.scheme, + parsed.netloc, + safe_path, + parsed.params, + parsed.query, + parsed.fragment, + ) + ) diff --git a/compass/web/website_crawl.py b/compass/web/website_crawl.py index a5fd2093..ccda23ff 100644 --- a/compass/web/website_crawl.py +++ b/compass/web/website_crawl.py @@ -9,15 +9,7 @@ import operator from collections import Counter from contextlib import AsyncExitStack -from urllib.parse import ( - urlparse, - urlunparse, - quote, - unquote, - parse_qsl, - urlencode, - urljoin, -) +from urllib.parse import urljoin from crawl4ai.models import Link as c4AILink from bs4 import BeautifulSoup @@ -28,6 +20,7 @@ from elm.web.document import PDFDocument, HTMLDocument from elm.web.file_loader import AsyncWebFileLoader from elm.web.website_crawl import ELMLinkScorer, _SCORE_KEY # noqa: PLC2701 +from compass.web.url_utils import _sanitize_url logger = logging.getLogger(__name__) @@ -495,32 +488,6 @@ def _debug_info_on_links(links): logger.debug(" ...") -def _sanitize_url(url): - """Fix common URL issues - - - Encode spaces and unsafe characters in the path - - Encode query parameters safely - - Leave existing percent-encoding intact - """ - parsed = urlparse(url) - - safe_path = quote(unquote(parsed.path), safe="/") - - query_params = parse_qsl(parsed.query, keep_blank_values=True) - safe_query = urlencode(query_params, doseq=True) # cspell: disable-line - - return urlunparse( - ( - parsed.scheme, - parsed.netloc, - safe_path, - parsed.params, - safe_query, - parsed.fragment, - ) - ) - - def _extract_links_from_html(text, base_url): """Parse HTML and extract all links""" soup = BeautifulSoup(text, "html.parser") diff --git a/tests/python/unit/web/test_web_crawl.py b/tests/python/unit/web/test_web_crawl.py index 24ae0e61..30dd2a86 100644 --- a/tests/python/unit/web/test_web_crawl.py +++ b/tests/python/unit/web/test_web_crawl.py @@ -267,6 +267,7 @@ def test_extract_links_from_html_sets_text_from_anchor(): Permit Standards """ links = _extract_links_from_html(html, base_url="https://example.com") + assert len(links) == 1 link = next(iter(links)) assert link.title == "Permit Standards" assert link.text == "Permit Standards" From 69418ca887e9f927e261963471319eabeccfa659 Mon Sep 17 00:00:00 2001 From: Byron Pullutasig <115118857+bpulluta@users.noreply.github.com> Date: Fri, 20 Mar 2026 23:10:12 -0600 Subject: [PATCH 3/4] fix failing test --- compass/web/url_utils.py | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/compass/web/url_utils.py b/compass/web/url_utils.py index 31aee68c..fb289877 100644 --- a/compass/web/url_utils.py +++ b/compass/web/url_utils.py @@ -1,20 +1,14 @@ """Shared URL utilities for COMPASS web modules""" -from urllib.parse import urlparse, urlunparse +from urllib.parse import quote, urlsplit, urlunsplit def _sanitize_url(url): - """Encode spaces in a URL path; leave query string intact""" - parsed = urlparse(url) - path = parsed.path - safe_path = path.replace(" ", "%20") if " " in path else path - return urlunparse( - ( - parsed.scheme, - parsed.netloc, - safe_path, - parsed.params, - parsed.query, - parsed.fragment, - ) + """Encode unsafe URL characters while preserving URL semantics""" + parsed = urlsplit(url) + path = quote(parsed.path, safe="/:@-._~!$&'()*+,;=") + query = quote(parsed.query, safe="=&;%:@-._~!$&'()*+,;/?:") + fragment = quote(parsed.fragment, safe="") + return urlunsplit( + (parsed.scheme, parsed.netloc, path, query, fragment) ) From 6c6e950e151d45902076e54d88628cd986fa52d6 Mon Sep 17 00:00:00 2001 From: Byron Pullutasig <115118857+bpulluta@users.noreply.github.com> Date: Fri, 20 Mar 2026 23:13:10 -0600 Subject: [PATCH 4/4] ruff error fix --- compass/web/url_utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/compass/web/url_utils.py b/compass/web/url_utils.py index fb289877..2eeb464d 100644 --- a/compass/web/url_utils.py +++ b/compass/web/url_utils.py @@ -9,6 +9,4 @@ def _sanitize_url(url): path = quote(parsed.path, safe="/:@-._~!$&'()*+,;=") query = quote(parsed.query, safe="=&;%:@-._~!$&'()*+,;/?:") fragment = quote(parsed.fragment, safe="") - return urlunsplit( - (parsed.scheme, parsed.netloc, path, query, fragment) - ) + return urlunsplit((parsed.scheme, parsed.netloc, path, query, fragment))