-
Notifications
You must be signed in to change notification settings - Fork 17
Enhancing URL detection #57
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
0dbaa66
3f2711b
30893a6
7ce778f
78ba233
bb55585
9ad835d
0dd7311
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
|
|
@@ -27,14 +27,21 @@ | |||||
| from typing import Any | ||||||
| from urllib.parse import ParseResult, urlparse | ||||||
|
|
||||||
| from pydantic import BaseModel, Field | ||||||
| from pydantic import BaseModel, Field, field_validator | ||||||
|
|
||||||
| from guardrails.registry import default_spec_registry | ||||||
| from guardrails.spec import GuardrailSpecMetadata | ||||||
| from guardrails.types import GuardrailResult | ||||||
|
|
||||||
| __all__ = ["urls"] | ||||||
|
|
||||||
| DEFAULT_PORTS = { | ||||||
| "http": 80, | ||||||
| "https": 443, | ||||||
| } | ||||||
|
|
||||||
| SCHEME_PREFIX_RE = re.compile(r"^[a-z][a-z0-9+.-]*://") | ||||||
|
|
||||||
|
|
||||||
| @dataclass(frozen=True, slots=True) | ||||||
| class UrlDetectionResult: | ||||||
|
|
@@ -66,9 +73,54 @@ class URLConfig(BaseModel): | |||||
| description="Allow subdomains of allowed domains (e.g. api.example.com if example.com is allowed)", | ||||||
| ) | ||||||
|
|
||||||
| @field_validator("allowed_schemes", mode="before") | ||||||
| @classmethod | ||||||
| def normalize_allowed_schemes(cls, value: Any) -> set[str]: | ||||||
| """Normalize allowed schemes to bare identifiers without delimiters.""" | ||||||
| if value is None: | ||||||
| return {"https"} | ||||||
|
|
||||||
| if isinstance(value, str): | ||||||
| raw_values = [value] | ||||||
| else: | ||||||
| raw_values = list(value) | ||||||
|
|
||||||
| normalized: set[str] = set() | ||||||
| for entry in raw_values: | ||||||
| if not isinstance(entry, str): | ||||||
| raise TypeError("allowed_schemes entries must be strings") | ||||||
| cleaned = entry.strip().lower() | ||||||
| if not cleaned: | ||||||
| continue | ||||||
| # Support inputs like "https://", "HTTPS:", or " https " | ||||||
| if cleaned.endswith("://"): | ||||||
| cleaned = cleaned[:-3] | ||||||
| cleaned = cleaned.removesuffix(":") | ||||||
| cleaned = cleaned.strip() | ||||||
steven10a marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||||||
| if cleaned: | ||||||
| normalized.add(cleaned) | ||||||
|
|
||||||
| if not normalized: | ||||||
| raise ValueError("allowed_schemes must include at least one scheme") | ||||||
|
|
||||||
| return normalized | ||||||
|
|
||||||
|
|
||||||
| def _detect_urls(text: str) -> list[str]: | ||||||
| """Detect URLs using regex.""" | ||||||
| """Detect URLs using regex patterns with deduplication. | ||||||
|
|
||||||
| Detects URLs with explicit schemes (http, https, ftp, data, javascript, | ||||||
| vbscript), domain-like patterns without schemes, and IP addresses. | ||||||
| Deduplicates to avoid returning both scheme-ful and scheme-less versions | ||||||
| of the same URL. | ||||||
|
|
||||||
| Args: | ||||||
| text: The text to scan for URLs. | ||||||
|
|
||||||
| Returns: | ||||||
| List of unique URL strings found in the text, with trailing | ||||||
| punctuation removed. | ||||||
| """ | ||||||
| # Pattern for cleaning trailing punctuation (] must be escaped) | ||||||
| PUNCTUATION_CLEANUP = r"[.,;:!?)\]]+$" | ||||||
|
|
||||||
|
|
@@ -156,7 +208,20 @@ def _detect_urls(text: str) -> list[str]: | |||||
|
|
||||||
|
|
||||||
| def _validate_url_security(url_string: str, config: URLConfig) -> tuple[ParseResult | None, str]: | ||||||
| """Validate URL using stdlib urllib.parse.""" | ||||||
| """Validate URL security properties using urllib.parse. | ||||||
|
|
||||||
| Checks URL structure, validates the scheme is allowed, and ensures no | ||||||
| credentials are embedded in userinfo if block_userinfo is enabled. | ||||||
|
|
||||||
| Args: | ||||||
| url_string: The URL string to validate. | ||||||
| config: Configuration specifying allowed schemes and userinfo policy. | ||||||
|
|
||||||
| Returns: | ||||||
| A tuple of (parsed_url, error_reason). If validation succeeds, | ||||||
| parsed_url is a ParseResult and error_reason is empty. If validation | ||||||
| fails, parsed_url is None and error_reason describes the failure. | ||||||
| """ | ||||||
| try: | ||||||
| # Parse URL - preserve original scheme for validation | ||||||
| if "://" in url_string: | ||||||
|
|
@@ -185,7 +250,7 @@ def _validate_url_security(url_string: str, config: URLConfig) -> tuple[ParseRes | |||||
| if original_scheme not in config.allowed_schemes: | ||||||
| return None, f"Blocked scheme: {original_scheme}" | ||||||
|
|
||||||
| if config.block_userinfo and parsed_url.username: | ||||||
| if config.block_userinfo and (parsed_url.username or parsed_url.password): | ||||||
| return None, "Contains userinfo (potential credential injection)" | ||||||
|
|
||||||
| # Everything else (IPs, localhost, private IPs) goes through allow list logic | ||||||
|
|
@@ -203,7 +268,20 @@ def _validate_url_security(url_string: str, config: URLConfig) -> tuple[ParseRes | |||||
|
|
||||||
|
|
||||||
| def _is_url_allowed(parsed_url: ParseResult, allow_list: list[str], allow_subdomains: bool) -> bool: | ||||||
| """Check if URL is allowed.""" | ||||||
| """Check if parsed URL matches any entry in the allow list. | ||||||
|
|
||||||
| Supports domain names, IP addresses, CIDR blocks, and full URLs with | ||||||
| paths/ports/query strings. Allow list entries without explicit schemes | ||||||
| match any scheme. Entries with schemes must match exactly. | ||||||
|
|
||||||
| Args: | ||||||
| parsed_url: The parsed URL to check. | ||||||
| allow_list: List of allowed URL patterns (domains, IPs, CIDR, full URLs). | ||||||
| allow_subdomains: If True, subdomains of allowed domains are permitted. | ||||||
|
|
||||||
| Returns: | ||||||
| True if the URL matches any allow list entry, False otherwise. | ||||||
| """ | ||||||
| if not allow_list: | ||||||
| return False | ||||||
|
|
||||||
|
|
@@ -212,30 +290,85 @@ def _is_url_allowed(parsed_url: ParseResult, allow_list: list[str], allow_subdom | |||||
| return False | ||||||
|
|
||||||
| url_host = url_host.lower() | ||||||
| url_domain = url_host.replace("www.", "") | ||||||
|
||||||
| url_domain = url_host.replace("www.", "") | |
| url_domain = url_host.removeprefix("www.") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Working as intended (WAI). Same comment as above.
steven10a marked this conversation as resolved.
Show resolved
Hide resolved
steven10a marked this conversation as resolved.
Show resolved
Hide resolved
Copilot
AI
Nov 20, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Using replace("www.", "") removes all occurrences of "www." from the hostname, not just the prefix. This could lead to unexpected behavior. For example, "www.www.example.com" would be treated as equivalent to "example.com" when matching allow list entries. Consider using removeprefix("www.") instead to only remove the "www." prefix, or document this behavior explicitly if it's intentional.
| allowed_domain = allowed_host.replace("www.", "") | |
| allowed_domain = allowed_host.removeprefix("www.") |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this is a [nit], or actually beneficial. Extra instances of `www.` would clearly be a typo and do not add anything malicious. Without removing the extra `www.` prefixes we would see more unnecessary mismatches.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[nitpick] Limited default port coverage: The `DEFAULT_PORTS` dictionary only includes mappings for HTTP (80) and HTTPS (443), but the code supports additional schemes like FTP, data, javascript, vbscript, and mailto (as seen in the detection patterns and special scheme handling). FTP typically uses port 21 by default. Consider adding FTP's default port, or documenting that only HTTP/HTTPS have default port handling.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
[nit]